From e2226cd4fdf2203693d1bf3a04af1fdc3c82de5d Mon Sep 17 00:00:00 2001 From: bshifaw Date: Fri, 19 May 2023 14:47:26 -0400 Subject: [PATCH 01/18] add docker_usage_sum.py --- scripts/docker/docker_usage_sum.py | 216 +++++++++++++++++++++++++++++ 1 file changed, 216 insertions(+) create mode 100644 scripts/docker/docker_usage_sum.py diff --git a/scripts/docker/docker_usage_sum.py b/scripts/docker/docker_usage_sum.py new file mode 100644 index 000000000..c335e0c02 --- /dev/null +++ b/scripts/docker/docker_usage_sum.py @@ -0,0 +1,216 @@ +import os +import re +import urllib.request +from urllib.error import HTTPError, URLError +import json + + +# A script to collect which dockers are in use and which latest dockers are available + +def main(): + dir = os.path.abspath(os.path.dirname(__file__)) + # os.chdir(dir) + + print("COLLECTING DOCKERS IN USE...") + # os.chdir("../../wdl") + wdls_dir = os.path.join(dir, "../../wdl") + if os.path.exists("dockers.in_use.tsv"): + os.remove("dockers.in_use.tsv") + + wdl_files = get_wdl_files(dir_to_wdls=wdls_dir) + global_docker_info = [] + + for wdl_path in wdl_files: + + wdl_name = wdl_path + + with open(wdl_path, "r") as file: + content = file.read() + pattern = re.compile(r'.*docker.*"') + if pattern.search(content): + matched_lines = [] + file.seek(0) + lines = file.readlines() + + for line_number, line in enumerate(lines, start=1): + if pattern.search(line): + matched_lines.append((line_number, line.strip())) + + docker_info: list[str] = get_docker_info_from_string( + wdl_lines=matched_lines, wdl_name=wdl_name + ) + + sorted_info: list = sorted(docker_info, reverse=False) + + global_docker_info.append(sorted_info) + + with open("dockers.in_use.tsv", "a") as tsv_file: + tsv_file.write(f"name\tused_tag\tlatest_tag\tline\twdl\n") + for line in sorted(global_docker_info): + tsv_file.write("\n".join(line) + "\n") + + print("DONE. 
PLEASE CHECKOUT TSV FILE: dockers.in_use.tsv") + # os.chdir(dir) + # os.rename("../../wdl/dockers.in_use.tsv", "dockers.in_use.tsv") + + +def get_wdl_files(dir_to_wdls: str) -> list: + """ + Returns a list of wdl files + @return: + """ + wdl_files = [] + for root, _, files in os.walk(dir_to_wdls): + for filename in files: + if filename.endswith(".wdl"): + wdl_path = os.path.join(root, filename) + wdl_files.append(wdl_path) + + return wdl_files + + +def get_docker_info_from_string(wdl_lines: [tuple], wdl_name: str) -> list: + """ + Returns a list of docker info + @param wdl_name: + @param wdl_lines: (line_number, line_content) + @return: + """ + docker_detail = [] + + for line_num, line_content in wdl_lines: + docker_names = re.findall(r'docker.*"(\S*?)"', line_content) + if docker_names: + docker_name = docker_names[0] + used_tag = os.path.basename(docker_name).split(":")[1] + docker_path = docker_name.split(":")[0] + latest_tag = get_latest_local_docker_tag(docker_path) + latest_tag = get_latest_remote_docker_tag( + docker_path) if latest_tag == "NA" else latest_tag + docker_detail.append( + f"{docker_path}\t{used_tag}\t{latest_tag}\t{line_num}\t{wdl_name}") + else: + pass + + return docker_detail + + +def get_latest_remote_docker_tag(docker_path: str) -> str: + """ + Returns the latest tag of a docker + @param docker_path: + @return: + """ + if "gcr" in docker_path: + latest_tag = get_latest_tag_from_gcr(docker_path) + elif "quay.io" in docker_path: + latest_tag = get_latest_tag_from_quayio(docker_path) + else: + latest_tag = get_latest_tag_from_duckerhub(docker_path) + return latest_tag + + +def get_latest_tag_from_duckerhub(docker_path: str) -> str: + image_name = docker_path + registry_url = f"https://registry.hub.docker.com/v2/repositories/{image_name}/tags/?page_size=1&ordering=last_updated" + try: + with urllib.request.urlopen(registry_url) as response: + data = response.read().decode("utf-8") + json_data = json.loads(data) + tags = json_data.get("results") 
+ if tags: + latest_tag = tags[0].get("name") + return latest_tag + else: + return "NA" + except urllib.error.HTTPError as e: + # print(f"Error: {e.code} - {e.reason}") + pass + except urllib.error.URLError as e: + # print(f"Error: Failed to reach the server - {e.reason}") + pass + + +def get_latest_tag_from_gcr(docker_path: str) -> str: + # Split the image string into project ID and image name + parts = docker_path.split("/") + gcr_repo = parts[0] + project_id = parts[1] + image_name = "/".join(parts[2:]) + # Construct the URL for retrieving tags + registry_url = f"https://{gcr_repo}/v2/{project_id}/{image_name}/tags/list" + + try: + # Send the GET request to the Container Registry API + with urllib.request.urlopen(registry_url) as response: + data = response.read().decode("utf-8") + json_data = json.loads(data) + tags = json_data.get("tags") + if tags: + latest_tag = max(tags) + return latest_tag + else: + return "NA" + except urllib.error.HTTPError as e: + # print(f"Error: {e.code} - {e.reason}") + pass + except urllib.error.URLError as e: + # print(f"Error: Failed to reach the server - {e.reason}") + pass + + +def get_latest_tag_from_quayio(docker_path: str) -> str: + # Split the image string into project ID and image name + parts = docker_path.split("/") + quayio_repo = parts[0] + project_id = parts[1] + image_name = "/".join(parts[2:]) + # Construct the URL for retrieving tags + registry_url = f"https://{quayio_repo}/v2/{project_id}/{image_name}/tags/list" + + try: + # Send the GET request to the Container Registry API + with urllib.request.urlopen(registry_url) as response: + data = response.read().decode("utf-8") + json_data = json.loads(data) + tags = json_data.get("tags") + if tags: + latest_tag = max(tags) + return latest_tag + else: + return "NA" + except urllib.error.HTTPError as e: + # print(f"Error: {e.code} - {e.reason}") + pass + except urllib.error.URLError as e: + # print(f"Error: Failed to reach the server - {e.reason}") + pass + + +def 
get_latest_local_docker_tag(docker_path: str) -> str: + """ + Returns the latest tag of a docker + @param docker_path: + @return: + """ + docker_name = os.path.basename(docker_path) + docker_dir = "../docker" + latest_tag = "NA" + + for docker_im_dir in os.listdir(docker_dir): + if docker_im_dir == docker_name: + docker_dir_path = os.path.join(docker_dir, docker_im_dir) + for makefile in os.listdir(docker_dir_path): + if not makefile.endswith("Makefile"): + continue + + with open(os.path.join(docker_dir_path, makefile)) as f: + for makefile_line in f: + if "VERSION =" in makefile_line: + latest_tag = makefile_line.split("=")[1].strip() + + return latest_tag + + +if __name__ == "__main__": + main() From b9165b7c071afad85dea49ac101e13d857e29ef1 Mon Sep 17 00:00:00 2001 From: bshifaw Date: Fri, 19 May 2023 17:54:59 -0400 Subject: [PATCH 02/18] updates made to docker_usage_sum.py --- scripts/docker/docker_usage_sum.py | 151 +++++++++++++++++++++++------ 1 file changed, 124 insertions(+), 27 deletions(-) diff --git a/scripts/docker/docker_usage_sum.py b/scripts/docker/docker_usage_sum.py index c335e0c02..e9ce2842b 100644 --- a/scripts/docker/docker_usage_sum.py +++ b/scripts/docker/docker_usage_sum.py @@ -1,26 +1,36 @@ import os import re +import subprocess import urllib.request from urllib.error import HTTPError, URLError import json # A script to collect which dockers are in use and which latest dockers are available +# Usage: python3 docker_usage_sum.py +# Output: dockers.in_use.tsv +# Note: This script is not perfect. It will not be able to detect dockers that are +# imported from other wdl files. It will only detect dockers that are +# explicitly defined in the wdl file. +# The script assumes it is executed from the scripts/docker directory, and the +# wdl files are in ../../wdl directory. 
def main(): - dir = os.path.abspath(os.path.dirname(__file__)) - # os.chdir(dir) + current_dir = os.path.abspath(os.path.dirname(__file__)) print("COLLECTING DOCKERS IN USE...") - # os.chdir("../../wdl") - wdls_dir = os.path.join(dir, "../../wdl") - if os.path.exists("dockers.in_use.tsv"): - os.remove("dockers.in_use.tsv") + wdls_dir = os.path.abspath(os.path.join(current_dir, "../../wdl")) + sum_tsv_file = os.path.join(current_dir, "dockers.in_use.tsv") + + if os.path.exists(sum_tsv_file): + os.remove(sum_tsv_file) wdl_files = get_wdl_files(dir_to_wdls=wdls_dir) global_docker_info = [] - for wdl_path in wdl_files: + total_files = len(wdl_files) # Used for Progression calculation + + for index, wdl_path in enumerate(wdl_files, start=1): wdl_name = wdl_path @@ -37,21 +47,26 @@ def main(): matched_lines.append((line_number, line.strip())) docker_info: list[str] = get_docker_info_from_string( - wdl_lines=matched_lines, wdl_name=wdl_name + wdl_lines=matched_lines, wdl_path=wdl_name ) sorted_info: list = sorted(docker_info, reverse=False) global_docker_info.append(sorted_info) - with open("dockers.in_use.tsv", "a") as tsv_file: - tsv_file.write(f"name\tused_tag\tlatest_tag\tline\twdl\n") + # Progression + # Calculate the percentage completion + progress = (index + 1) / total_files * 100 + + # Clear the previous line and print the progress + print(f"Progress: {progress:.2f}%\r", end="") + + with open(sum_tsv_file, "a") as tsv_file: + tsv_file.write(f"DOCKER_NAME\tUSED_TAG\tLATEST_TAG\tFILE_LINE\tWDL_PATH\n") for line in sorted(global_docker_info): tsv_file.write("\n".join(line) + "\n") - print("DONE. PLEASE CHECKOUT TSV FILE: dockers.in_use.tsv") - # os.chdir(dir) - # os.rename("../../wdl/dockers.in_use.tsv", "dockers.in_use.tsv") + print(f"DONE. 
PLEASE CHECKOUT TSV FILE: {sum_tsv_file}") def get_wdl_files(dir_to_wdls: str) -> list: @@ -69,15 +84,17 @@ def get_wdl_files(dir_to_wdls: str) -> list: return wdl_files -def get_docker_info_from_string(wdl_lines: [tuple], wdl_name: str) -> list: +def get_docker_info_from_string(wdl_lines: [tuple], wdl_path: str) -> list: """ Returns a list of docker info - @param wdl_name: + @param wdl_path: @param wdl_lines: (line_number, line_content) @return: """ docker_detail = [] + wdl_path_sum = wdl_path[wdl_path.find("/wdl/"):] + for line_num, line_content in wdl_lines: docker_names = re.findall(r'docker.*"(\S*?)"', line_content) if docker_names: @@ -88,7 +105,7 @@ def get_docker_info_from_string(wdl_lines: [tuple], wdl_name: str) -> list: latest_tag = get_latest_remote_docker_tag( docker_path) if latest_tag == "NA" else latest_tag docker_detail.append( - f"{docker_path}\t{used_tag}\t{latest_tag}\t{line_num}\t{wdl_name}") + f"{docker_path}\t{used_tag}\t{latest_tag}\t{line_num}\t{wdl_path_sum}") else: pass @@ -101,16 +118,25 @@ def get_latest_remote_docker_tag(docker_path: str) -> str: @param docker_path: @return: """ - if "gcr" in docker_path: + if "gcr" in docker_path or "ghcr" in docker_path: latest_tag = get_latest_tag_from_gcr(docker_path) + if latest_tag == "NA" or latest_tag == "None": + latest_tag = get_gcr_tag_with_gcloud(docker_path) elif "quay.io" in docker_path: - latest_tag = get_latest_tag_from_quayio(docker_path) + latest_tag = get_latest_tag_from_quay(docker_path) else: - latest_tag = get_latest_tag_from_duckerhub(docker_path) + latest_tag = get_latest_tag_from_dockerhub(docker_path) return latest_tag -def get_latest_tag_from_duckerhub(docker_path: str) -> str: +def get_latest_tag_from_dockerhub(docker_path: str) -> str: + + """ + Returns the latest tag of a docker from dockerhub using the dockerhub API + @param docker_path: + @return: + """ + image_name = docker_path registry_url = 
f"https://registry.hub.docker.com/v2/repositories/{image_name}/tags/?page_size=1&ordering=last_updated" try: @@ -132,13 +158,20 @@ def get_latest_tag_from_duckerhub(docker_path: str) -> str: def get_latest_tag_from_gcr(docker_path: str) -> str: + + """ + Returns the latest tag of a docker from GCR using the Container Registry API + @param docker_path: + @return: + """ + # Split the image string into project ID and image name parts = docker_path.split("/") gcr_repo = parts[0] project_id = parts[1] image_name = "/".join(parts[2:]) # Construct the URL for retrieving tags - registry_url = f"https://{gcr_repo}/v2/{project_id}/{image_name}/tags/list" + registry_url = f"https://{gcr_repo}/v2/{project_id}/{image_name}/tags/list?page_size=1&ordering=last_updated" try: # Send the GET request to the Container Registry API @@ -146,9 +179,12 @@ def get_latest_tag_from_gcr(docker_path: str) -> str: data = response.read().decode("utf-8") json_data = json.loads(data) tags = json_data.get("tags") - if tags: - latest_tag = max(tags) - return latest_tag + + tags_str_removed = [item for item in tags if any(char.isdigit() for char in item)] + if tags_str_removed: + if tags_str_removed is not None: + latest_tag = max(tags_str_removed) + return latest_tag else: return "NA" except urllib.error.HTTPError as e: @@ -159,7 +195,66 @@ def get_latest_tag_from_gcr(docker_path: str) -> str: pass -def get_latest_tag_from_quayio(docker_path: str) -> str: +def get_gcr_tag_with_gcloud(docker_path: str) -> str or None: + + """ + Returns the latest tag of a docker using gcloud + @param docker_path: + @return: + """ + + # Split the image string into project ID and image name + + if is_gcloud_installed(): + + command = [ + "gcloud", + "container", + "images", + "list-tags", + docker_path, + "--format=get(tags)", + "--limit=1", + "--sort-by=~timestamp.datetime", + "--filter=tags:*", + ] + + process = subprocess.run(command, capture_output=True, text=True) + if process.returncode == 0: + output = 
process.stdout.strip() + if output: + latest_tag = output.splitlines()[0] + return latest_tag + + # Error handling + error_message = process.stderr.strip() if process.stderr else process.stdout.strip() + #print(f"Error: {error_message}") + return None + else: + return None + + +def is_gcloud_installed() -> bool: + """ + Checks if gcloud is installed + @return: + """ + + command = ["gcloud", "--version"] + + try: + subprocess.run(command, check=True, capture_output=True) + return True + except subprocess.CalledProcessError: + return False + + +def get_latest_tag_from_quay(docker_path: str) -> str: + """ + Returns the latest tag of a docker from quay.io + @param docker_path: + @return: + """ # Split the image string into project ID and image name parts = docker_path.split("/") quayio_repo = parts[0] @@ -174,8 +269,10 @@ def get_latest_tag_from_quayio(docker_path: str) -> str: data = response.read().decode("utf-8") json_data = json.loads(data) tags = json_data.get("tags") - if tags: - latest_tag = max(tags) + + tags_str_removed = [item for item in tags if any(char.isdigit() for char in item)] + if tags_str_removed: + latest_tag = max(tags_str_removed) return latest_tag else: return "NA" From 387fe74d6a7da1de9356841d7728aa729128670a Mon Sep 17 00:00:00 2001 From: bshifaw Date: Fri, 19 May 2023 17:56:09 -0400 Subject: [PATCH 03/18] moved collect_docker_in_system.sh to scripts/docker folder --- scripts/{ => docker}/collect_docker_in_system.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename scripts/{ => docker}/collect_docker_in_system.sh (100%) diff --git a/scripts/collect_docker_in_system.sh b/scripts/docker/collect_docker_in_system.sh similarity index 100% rename from scripts/collect_docker_in_system.sh rename to scripts/docker/collect_docker_in_system.sh From a9e24fcfd02c17e7e6406a8965a98a30679d4750 Mon Sep 17 00:00:00 2001 From: bshifaw Date: Fri, 19 May 2023 18:04:47 -0400 Subject: [PATCH 04/18] adding notes to docker_usage_sum.py --- 
scripts/docker/docker_usage_sum.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/scripts/docker/docker_usage_sum.py b/scripts/docker/docker_usage_sum.py index e9ce2842b..edc7c1272 100644 --- a/scripts/docker/docker_usage_sum.py +++ b/scripts/docker/docker_usage_sum.py @@ -9,11 +9,15 @@ # A script to collect which dockers are in use and which latest dockers are available # Usage: python3 docker_usage_sum.py # Output: dockers.in_use.tsv -# Note: This script is not perfect. It will not be able to detect dockers that are +# Note: - This script is not perfect. It will not be able to detect dockers that are # imported from other wdl files. It will only detect dockers that are # explicitly defined in the wdl file. -# The script assumes it is executed from the scripts/docker directory, and the +# - The script assumes it is executed from the scripts/docker directory, and the # wdl files are in ../../wdl directory. +# - The script will retrieve the "latest" tag by date, so if an unofficial tag was +# created, after the official tag was created, the script will retrieve the +# unofficial tag as the latest tag. (It tries to avoid this by filtering out +# tags with no digits). 
def main(): current_dir = os.path.abspath(os.path.dirname(__file__)) From d027cf0cd5d71273f6e0a991c2a187750fd22876 Mon Sep 17 00:00:00 2001 From: bshifaw Date: Tue, 22 Aug 2023 11:09:45 -0400 Subject: [PATCH 05/18] Refactoring, switched tsv columns, logic to look for latest tag for gcr --- scripts/docker/docker_usage_sum.py | 130 ++++++++++++++++++----------- 1 file changed, 79 insertions(+), 51 deletions(-) diff --git a/scripts/docker/docker_usage_sum.py b/scripts/docker/docker_usage_sum.py index edc7c1272..073b0eec0 100644 --- a/scripts/docker/docker_usage_sum.py +++ b/scripts/docker/docker_usage_sum.py @@ -4,6 +4,9 @@ import urllib.request from urllib.error import HTTPError, URLError import json +import logging + +logging.basicConfig(level=logging.INFO) # A script to collect which dockers are in use and which latest dockers are available @@ -19,17 +22,19 @@ # unofficial tag as the latest tag. (It tries to avoid this by filtering out # tags with no digits). +# TODO: Future suggestion: have the results be generated for main branch for each merge + def main(): current_dir = os.path.abspath(os.path.dirname(__file__)) print("COLLECTING DOCKERS IN USE...") - wdls_dir = os.path.abspath(os.path.join(current_dir, "../../wdl")) - sum_tsv_file = os.path.join(current_dir, "dockers.in_use.tsv") + WDLS_DIR = os.path.abspath(os.path.join(current_dir, "../../wdl")) + OUT_SUMMARY_TSV = os.path.join(current_dir, "dockers.in_use.tsv") - if os.path.exists(sum_tsv_file): - os.remove(sum_tsv_file) + if os.path.exists(OUT_SUMMARY_TSV): + os.remove(OUT_SUMMARY_TSV) - wdl_files = get_wdl_files(dir_to_wdls=wdls_dir) + wdl_files = get_wdl_files(dir_to_wdls=WDLS_DIR) global_docker_info = [] total_files = len(wdl_files) # Used for Progression calculation @@ -38,13 +43,13 @@ def main(): wdl_name = wdl_path - with open(wdl_path, "r") as file: - content = file.read() - pattern = re.compile(r'.*docker.*"') - if pattern.search(content): + with open(wdl_path, "r") as file_content: + content = 
file_content.read() + pattern = re.compile(r'.*docker:.*"') + if pattern.search(content): # If wdl file contains "docker:" matched_lines = [] - file.seek(0) - lines = file.readlines() + file_content.seek(0) + lines = file_content.readlines() for line_number, line in enumerate(lines, start=1): if pattern.search(line): @@ -57,20 +62,23 @@ def main(): sorted_info: list = sorted(docker_info, reverse=False) global_docker_info.append(sorted_info) + else: + pass # Progression # Calculate the percentage completion - progress = (index + 1) / total_files * 100 + progress: float = (index + 1) / total_files * 100 # Clear the previous line and print the progress print(f"Progress: {progress:.2f}%\r", end="") + with open(OUT_SUMMARY_TSV, "a") as tsv_file: + # Add header + tsv_file.write(f"DOCKER_NAME\tLATEST_TAG\tUSED_TAG\tFILE_LINE\tWDL_PATH") + # Add content + for docker_info_line in sorted(global_docker_info): + tsv_file.write("\n".join(docker_info_line) + "\n") - with open(sum_tsv_file, "a") as tsv_file: - tsv_file.write(f"DOCKER_NAME\tUSED_TAG\tLATEST_TAG\tFILE_LINE\tWDL_PATH\n") - for line in sorted(global_docker_info): - tsv_file.write("\n".join(line) + "\n") - - print(f"DONE. PLEASE CHECKOUT TSV FILE: {sum_tsv_file}") + print(f"DONE. PLEASE CHECKOUT TSV FILE: {OUT_SUMMARY_TSV}") def get_wdl_files(dir_to_wdls: str) -> list: @@ -91,25 +99,27 @@ def get_wdl_files(dir_to_wdls: str) -> list: def get_docker_info_from_string(wdl_lines: [tuple], wdl_path: str) -> list: """ Returns a list of docker info - @param wdl_path: + @param wdl_path: path to wdl file @param wdl_lines: (line_number, line_content) - @return: + @return: list of docker info e.g. 
[" docker_name\tlatest_tag\tused_tag\tline_num\twdl_path, ..."] """ docker_detail = [] + # Get the path after /wdl/ for better readability wdl_path_sum = wdl_path[wdl_path.find("/wdl/"):] for line_num, line_content in wdl_lines: - docker_names = re.findall(r'docker.*"(\S*?)"', line_content) + docker_names = re.findall(r'docker:\s*"(\S*?)"', line_content) # if docker_names: - docker_name = docker_names[0] - used_tag = os.path.basename(docker_name).split(":")[1] - docker_path = docker_name.split(":")[0] - latest_tag = get_latest_local_docker_tag(docker_path) + docker_name_and_version = docker_names[0] + used_tag = os.path.basename(docker_name_and_version).split(":")[1] + docker_name = docker_name_and_version.split(":")[0] + latest_tag = get_latest_local_docker_tag(docker_name) + # If the latest tag is not found locally, try to get it from remote latest_tag = get_latest_remote_docker_tag( - docker_path) if latest_tag == "NA" else latest_tag + docker_name) if latest_tag == "NA" else latest_tag docker_detail.append( - f"{docker_path}\t{used_tag}\t{latest_tag}\t{line_num}\t{wdl_path_sum}") + f"{docker_name}\t{latest_tag}\t{used_tag}\t{line_num}\t{wdl_path_sum}") else: pass @@ -118,13 +128,13 @@ def get_docker_info_from_string(wdl_lines: [tuple], wdl_path: str) -> list: def get_latest_remote_docker_tag(docker_path: str) -> str: """ - Returns the latest tag of a docker + Returns the latest tag of a docker from gcr, quay or dockerhub @param docker_path: @return: """ if "gcr" in docker_path or "ghcr" in docker_path: latest_tag = get_latest_tag_from_gcr(docker_path) - if latest_tag == "NA" or latest_tag == "None": + if latest_tag == "NA" or latest_tag == "None" or latest_tag == "latest" or latest_tag is None: latest_tag = get_gcr_tag_with_gcloud(docker_path) elif "quay.io" in docker_path: latest_tag = get_latest_tag_from_quay(docker_path) @@ -170,12 +180,15 @@ def get_latest_tag_from_gcr(docker_path: str) -> str: """ # Split the image string into project ID and image name + # 
us.gcr.io/broad-dsp-lrma/lr-transcript_utils:latest parts = docker_path.split("/") - gcr_repo = parts[0] - project_id = parts[1] - image_name = "/".join(parts[2:]) + gcr_repo = parts[0] # Example: us.gcr.io + project_id = parts[1] # Example: broad-dsp-lrma + image_name = "/".join(parts[2:]) # Example: lr-transcript_utils + # Construct the URL for retrieving tags - registry_url = f"https://{gcr_repo}/v2/{project_id}/{image_name}/tags/list?page_size=1&ordering=last_updated" + # https://cloud.google.com/artifact-registry/docs/reference/docker-api + registry_url = f"https://{gcr_repo}/v2/{project_id}/{image_name}/tags/list" try: # Send the GET request to the Container Registry API @@ -183,7 +196,19 @@ def get_latest_tag_from_gcr(docker_path: str) -> str: data = response.read().decode("utf-8") json_data = json.loads(data) tags = json_data.get("tags") - + manifest = json_data.get("manifest") + + # The manifest is a list of dicts for each version of an image, each dict + # has a key called "tag", which is a list of tags for that version. + # The image version having "latest" as part of its tag is what we want. + for sha_key in manifest: + sha_key_tags = manifest[sha_key].get("tag") + if "latest" in sha_key_tags: + latest_tag = sha_key_tags[0] + return latest_tag if latest_tag else "NA" + + # If the image doesn't have a "latest" tag, return the tag with the + # highest version number. 
tags_str_removed = [item for item in tags if any(char.isdigit() for char in item)] if tags_str_removed: if tags_str_removed is not None: @@ -217,25 +242,28 @@ def get_gcr_tag_with_gcloud(docker_path: str) -> str or None: "images", "list-tags", docker_path, - "--format=get(tags)", + "--format=json", "--limit=1", "--sort-by=~timestamp.datetime", - "--filter=tags:*", + "--filter=tags:latest", ] - process = subprocess.run(command, capture_output=True, text=True) - if process.returncode == 0: - output = process.stdout.strip() - if output: - latest_tag = output.splitlines()[0] - return latest_tag + gc_container_results = subprocess.run(command, capture_output=True, text=True) + if gc_container_results and gc_container_results.returncode == 0: + gc_container_results_json = json.loads(gc_container_results.stdout) + try : + latest_tag = gc_container_results_json[0].get("tags")[0] + return latest_tag if latest_tag is not None else "NA" + except IndexError: - # Error handling - error_message = process.stderr.strip() if process.stderr else process.stdout.strip() - #print(f"Error: {error_message}") - return None + logging.warning(f"Gcloud Container obtain empty tag for : {gc_container_results_json} - {docker_path}") + return "NA" + else: + # Error handling + error_message = gc_container_results.stderr.strip() if gc_container_results.stderr else gc_container_results.stdout.strip() + #print(f"Error: {error_message}") else: - return None + return "NA" def is_gcloud_installed() -> bool: @@ -288,13 +316,13 @@ def get_latest_tag_from_quay(docker_path: str) -> str: pass -def get_latest_local_docker_tag(docker_path: str) -> str: +def get_latest_local_docker_tag(docker_name: str) -> str: """ - Returns the latest tag of a docker - @param docker_path: + Returns the latest tag of a docker from the local docker directory + @param docker_name: name of the docker e.g. 
"gatk" @return: """ - docker_name = os.path.basename(docker_path) + docker_name = os.path.basename(docker_name) docker_dir = "../docker" latest_tag = "NA" From 4995a573c8af64633245e885f3d8edbc5738b76c Mon Sep 17 00:00:00 2001 From: bshifaw Date: Tue, 22 Aug 2023 15:55:32 -0400 Subject: [PATCH 06/18] Refactoring, divide up some functions --- scripts/docker/docker_usage_sum.py | 104 +++++++++++++++++------------ 1 file changed, 62 insertions(+), 42 deletions(-) diff --git a/scripts/docker/docker_usage_sum.py b/scripts/docker/docker_usage_sum.py index 073b0eec0..21a9b4038 100644 --- a/scripts/docker/docker_usage_sum.py +++ b/scripts/docker/docker_usage_sum.py @@ -109,23 +109,39 @@ def get_docker_info_from_string(wdl_lines: [tuple], wdl_path: str) -> list: wdl_path_sum = wdl_path[wdl_path.find("/wdl/"):] for line_num, line_content in wdl_lines: - docker_names = re.findall(r'docker:\s*"(\S*?)"', line_content) # - if docker_names: - docker_name_and_version = docker_names[0] + docker_image = re.findall(r'docker:\s*"(\S*?)"', line_content) + if docker_image: + docker_name_and_version = docker_image[0] used_tag = os.path.basename(docker_name_and_version).split(":")[1] docker_name = docker_name_and_version.split(":")[0] - latest_tag = get_latest_local_docker_tag(docker_name) + + # Get latest tag from list of docker details if it was already retrieved + latest_tag = get_tag_from_docker_details(docker_detail=docker_detail, docker_name=docker_name) + # Get latest tag from local docker if it was not retrieved from list of docker details + latest_tag = get_latest_local_docker_tag(docker_name) if latest_tag == "NA" else latest_tag # If the latest tag is not found locally, try to get it from remote - latest_tag = get_latest_remote_docker_tag( - docker_name) if latest_tag == "NA" else latest_tag - docker_detail.append( - f"{docker_name}\t{latest_tag}\t{used_tag}\t{line_num}\t{wdl_path_sum}") + latest_tag = get_latest_remote_docker_tag(docker_name) if latest_tag == "NA" else 
latest_tag + docker_detail.append(f"{docker_name}\t{latest_tag}\t{used_tag}\t{line_num}\t{wdl_path_sum}") else: pass return docker_detail +def get_tag_from_docker_details(docker_detail: list, docker_name: str) -> str: + """ + Returns the latest tag of a docker from a list of docker details + @param docker_detail: list of docker details e.g. ["docker_name\tlatest_tag\tused_tag\tline_num\twdl_path, ..."] + @param docker_name: docker name + @return: latest tag + """ + latest_tag = "NA" + for docker_info in docker_detail: + if docker_name in docker_info: + latest_tag = docker_info.split("\t")[1] + break + return latest_tag + def get_latest_remote_docker_tag(docker_path: str) -> str: """ Returns the latest tag of a docker from gcr, quay or dockerhub @@ -134,7 +150,7 @@ def get_latest_remote_docker_tag(docker_path: str) -> str: """ if "gcr" in docker_path or "ghcr" in docker_path: latest_tag = get_latest_tag_from_gcr(docker_path) - if latest_tag == "NA" or latest_tag == "None" or latest_tag == "latest" or latest_tag is None: + if latest_tag in ["NA", "None", "latest"] or latest_tag is None: latest_tag = get_gcr_tag_with_gcloud(docker_path) elif "quay.io" in docker_path: latest_tag = get_latest_tag_from_quay(docker_path) @@ -193,29 +209,7 @@ def get_latest_tag_from_gcr(docker_path: str) -> str: try: # Send the GET request to the Container Registry API with urllib.request.urlopen(registry_url) as response: - data = response.read().decode("utf-8") - json_data = json.loads(data) - tags = json_data.get("tags") - manifest = json_data.get("manifest") - - # The manifest is a list of dicts for each version of an image, each dict - # has a key called "tag", which is a list of tags for that version. - # The image version having "latest" as part of its tag is what we want. 
- for sha_key in manifest: - sha_key_tags = manifest[sha_key].get("tag") - if "latest" in sha_key_tags: - latest_tag = sha_key_tags[0] - return latest_tag if latest_tag else "NA" - - # If the image doesn't have a "latest" tag, return the tag with the - # highest version number. - tags_str_removed = [item for item in tags if any(char.isdigit() for char in item)] - if tags_str_removed: - if tags_str_removed is not None: - latest_tag = max(tags_str_removed) - return latest_tag - else: - return "NA" + return extract_latest_tag_from_registry_response(response) except urllib.error.HTTPError as e: # print(f"Error: {e.code} - {e.reason}") pass @@ -223,6 +217,41 @@ def get_latest_tag_from_gcr(docker_path: str) -> str: # print(f"Error: Failed to reach the server - {e.reason}") pass +def extract_latest_tag_from_registry_response(response) -> str: + """ + Extracts the latest tag from the response of a registry API call + @param response: + @return: + """ + + response_content = response.read().decode("utf-8") + latest_tag = "NA" + + if response_content: + json_data = json.loads(response_content) + tags = json_data.get("tags") + manifest = json_data.get("manifest") + + # The manifest is a dict of dicts for each version of an image, each dict + # has a key called "tag", which is a list of tags for that version. + # The image version having "latest" as part of its tag is returned. + if manifest: + for sha_key in manifest: + sha_key_tags = manifest[sha_key].get("tag") + if "latest" in sha_key_tags: + latest_tag = sha_key_tags[0] + return latest_tag + + # If the image doesn't have a "latest" tag, return the tag with the + # highest version number. 
+ tags_str_removed = [item for item in tags if any(char.isdigit() for char in item)] + if tags_str_removed: + latest_tag = max(tags_str_removed) + return latest_tag + else: + return latest_tag # If no numerical version tags are found, return NA + else: + return latest_tag # If manifest is empty, return NA def get_gcr_tag_with_gcloud(docker_path: str) -> str or None: @@ -298,16 +327,7 @@ def get_latest_tag_from_quay(docker_path: str) -> str: try: # Send the GET request to the Container Registry API with urllib.request.urlopen(registry_url) as response: - data = response.read().decode("utf-8") - json_data = json.loads(data) - tags = json_data.get("tags") - - tags_str_removed = [item for item in tags if any(char.isdigit() for char in item)] - if tags_str_removed: - latest_tag = max(tags_str_removed) - return latest_tag - else: - return "NA" + return extract_latest_tag_from_registry_response(response) except urllib.error.HTTPError as e: # print(f"Error: {e.code} - {e.reason}") pass From e0d745b8ba94bde7bcdc7cde366d77801d453b09 Mon Sep 17 00:00:00 2001 From: bshifaw Date: Wed, 23 Aug 2023 09:46:39 -0400 Subject: [PATCH 07/18] Note about having gcloud installed --- scripts/docker/docker_usage_sum.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/docker/docker_usage_sum.py b/scripts/docker/docker_usage_sum.py index 21a9b4038..24088cc20 100644 --- a/scripts/docker/docker_usage_sum.py +++ b/scripts/docker/docker_usage_sum.py @@ -21,6 +21,8 @@ # created, after the official tag was created, the script will retrieve the # unofficial tag as the latest tag. (It tries to avoid this by filtering out # tags with no digits). +# - The script occasionally uses gcloud to retrieve the latest tag. Its suggested +# to have gcloud installed. 
# TODO: Future suggestion: have the results be generated for main branch for each merge From b4eb61c78210a840842d80e0fe03990e83d45e07 Mon Sep 17 00:00:00 2001 From: bshifaw Date: Wed, 25 Oct 2023 15:27:44 -0400 Subject: [PATCH 08/18] rm collect_docker_in_system.sh --- scripts/docker/collect_docker_in_system.sh | 40 ---------------------- 1 file changed, 40 deletions(-) delete mode 100644 scripts/docker/collect_docker_in_system.sh diff --git a/scripts/docker/collect_docker_in_system.sh b/scripts/docker/collect_docker_in_system.sh deleted file mode 100644 index c390eb5eb..000000000 --- a/scripts/docker/collect_docker_in_system.sh +++ /dev/null @@ -1,40 +0,0 @@ -#!/bin/bash - -set -eu - -################################################################################ -# A script to collect which dockers are in use and which latest dockers available -################################################################################ - -dir=$(cd -P -- "$(dirname -- "$0")" && pwd -P) -cd "${dir}" - - -echo "COLLECTING DOCKERS IN USE..." -cd ../wdl -rm -f dockers.in_use.tsv -for wdl in $(find . -name "*.wdl"| sed "s|^\./||") ; do - if ! grep -qE 'docker:\s+\"' "${wdl}"; then continue; fi; - grep -nE 'docker:\s+\"' "${wdl}" > tmp.0.txt - awk -F ':' '{print $3"\t"$4"\t"$1}' tmp.0.txt | sed -e 's/^[[:space:]]*//' | sed "s/\"//g" | awk -F '/' '{print $NF}' | sort > tmp.1.txt - sed -e "s%$%\t$wdl%" tmp.1.txt >> dockers.in_use.tsv - rm tmp.*.txt -done -echo -e "name\ttag\tline\twdl\n$(sort dockers.in_use.tsv)" > dockers.in_use.sorted.tsv -rm dockers.in_use.tsv - -echo "COLLECTING LATEST DOCKERS AVAILABLE..." -cd ../docker -rm -f dockers.latest.tsv -for makefile in $(find . 
-mindepth 2 -name "Makefile" | sed "s|^\./||") ; do - name=$(grep -m 1 -F 'TAG1' "${makefile}" | awk -F '/' '{print $NF}' | awk -F ':' '{print $1}') - tag=$(head -n 1 "${makefile}" | awk -F '=' '{print $NF}' | sed 's% %%g' | awk -F '#' '{print $1}') - echo -e "${name}\t${tag}" >> dockers.latest.tsv -done -sort dockers.latest.tsv > dockers.latest.sorted.tsv -rm dockers.latest.tsv - -echo "DONE. PLEASE CHECKOUT TWO TSV FILES: [dockers.in_use.sorted.tsv, dockers.latest.sorted.tsv]" -cd "${dir}" -mv ../wdl/dockers.in_use.sorted.tsv . -mv ../docker/dockers.latest.sorted.tsv . From 0a9bdac4bae8971347b9838e5354dcd1afbc8465 Mon Sep 17 00:00:00 2001 From: bshifaw Date: Wed, 25 Oct 2023 16:02:49 -0400 Subject: [PATCH 09/18] Added help message --- scripts/docker/docker_usage_sum.py | 73 ++++++++++++++++++------------ 1 file changed, 44 insertions(+), 29 deletions(-) diff --git a/scripts/docker/docker_usage_sum.py b/scripts/docker/docker_usage_sum.py index 24088cc20..b65b349a3 100644 --- a/scripts/docker/docker_usage_sum.py +++ b/scripts/docker/docker_usage_sum.py @@ -1,3 +1,4 @@ +import argparse import os import re import subprocess @@ -9,24 +10,32 @@ logging.basicConfig(level=logging.INFO) -# A script to collect which dockers are in use and which latest dockers are available -# Usage: python3 docker_usage_sum.py -# Output: dockers.in_use.tsv -# Note: - This script is not perfect. It will not be able to detect dockers that are -# imported from other wdl files. It will only detect dockers that are -# explicitly defined in the wdl file. -# - The script assumes it is executed from the scripts/docker directory, and the -# wdl files are in ../../wdl directory. -# - The script will retrieve the "latest" tag by date, so if an unofficial tag was -# created, after the official tag was created, the script will retrieve the -# unofficial tag as the latest tag. (It tries to avoid this by filtering out -# tags with no digits). 
-# - The script occasionally uses gcloud to retrieve the latest tag. Its suggested
-# to have gcloud installed.
-
 # TODO: Future suggestion: have the results be generated for main branch for each merge
 
 def main():
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        description='''
+Collects docker usage summary from wdl files.
+
+Output:
+    dockers.in_use.tsv
+Notes:
+    - This script is not perfect. It will not be able to detect dockers that are
+      imported from other wdl files. It will only detect dockers that are
+      explicitly defined in the wdl file.
+    - The script assumes it is executed from the scripts/docker directory, and the
+      wdl files are in ../../wdl directory.
+    - The script will retrieve the "latest" tag by date, so if an unofficial tag was
+      created, after the official tag was created, the script will retrieve the
+      unofficial tag as the latest tag. (It tries to avoid this by filtering out
+      tags with no digits).
+    - The script occasionally uses gcloud to retrieve the latest tag. It's suggested
+      to have gcloud installed.
+ ''', + ) + parser.parse_args() + current_dir = os.path.abspath(os.path.dirname(__file__)) print("COLLECTING DOCKERS IN USE...") @@ -48,7 +57,7 @@ def main(): with open(wdl_path, "r") as file_content: content = file_content.read() pattern = re.compile(r'.*docker:.*"') - if pattern.search(content): # If wdl file contains "docker:" + if pattern.search(content): # If wdl file contains "docker:" matched_lines = [] file_content.seek(0) lines = file_content.readlines() @@ -118,12 +127,16 @@ def get_docker_info_from_string(wdl_lines: [tuple], wdl_path: str) -> list: docker_name = docker_name_and_version.split(":")[0] # Get latest tag from list of docker details if it was already retrieved - latest_tag = get_tag_from_docker_details(docker_detail=docker_detail, docker_name=docker_name) + latest_tag = get_tag_from_docker_details(docker_detail=docker_detail, + docker_name=docker_name) # Get latest tag from local docker if it was not retrieved from list of docker details - latest_tag = get_latest_local_docker_tag(docker_name) if latest_tag == "NA" else latest_tag + latest_tag = get_latest_local_docker_tag( + docker_name) if latest_tag == "NA" else latest_tag # If the latest tag is not found locally, try to get it from remote - latest_tag = get_latest_remote_docker_tag(docker_name) if latest_tag == "NA" else latest_tag - docker_detail.append(f"{docker_name}\t{latest_tag}\t{used_tag}\t{line_num}\t{wdl_path_sum}") + latest_tag = get_latest_remote_docker_tag( + docker_name) if latest_tag == "NA" else latest_tag + docker_detail.append( + f"{docker_name}\t{latest_tag}\t{used_tag}\t{line_num}\t{wdl_path_sum}") else: pass @@ -144,6 +157,7 @@ def get_tag_from_docker_details(docker_detail: list, docker_name: str) -> str: break return latest_tag + def get_latest_remote_docker_tag(docker_path: str) -> str: """ Returns the latest tag of a docker from gcr, quay or dockerhub @@ -162,7 +176,6 @@ def get_latest_remote_docker_tag(docker_path: str) -> str: def 
get_latest_tag_from_dockerhub(docker_path: str) -> str: - """ Returns the latest tag of a docker from dockerhub using the dockerhub API @param docker_path: @@ -190,7 +203,6 @@ def get_latest_tag_from_dockerhub(docker_path: str) -> str: def get_latest_tag_from_gcr(docker_path: str) -> str: - """ Returns the latest tag of a docker from GCR using the Container Registry API @param docker_path: @@ -219,6 +231,7 @@ def get_latest_tag_from_gcr(docker_path: str) -> str: # print(f"Error: Failed to reach the server - {e.reason}") pass + def extract_latest_tag_from_registry_response(response) -> str: """ Extracts the latest tag from the response of a registry API call @@ -246,17 +259,18 @@ def extract_latest_tag_from_registry_response(response) -> str: # If the image doesn't have a "latest" tag, return the tag with the # highest version number. - tags_str_removed = [item for item in tags if any(char.isdigit() for char in item)] + tags_str_removed = [item for item in tags if + any(char.isdigit() for char in item)] if tags_str_removed: latest_tag = max(tags_str_removed) return latest_tag else: - return latest_tag # If no numerical version tags are found, return NA + return latest_tag # If no numerical version tags are found, return NA else: - return latest_tag # If manifest is empty, return NA + return latest_tag # If manifest is empty, return NA -def get_gcr_tag_with_gcloud(docker_path: str) -> str or None: +def get_gcr_tag_with_gcloud(docker_path: str) -> str or None: """ Returns the latest tag of a docker using gcloud @param docker_path: @@ -282,17 +296,18 @@ def get_gcr_tag_with_gcloud(docker_path: str) -> str or None: gc_container_results = subprocess.run(command, capture_output=True, text=True) if gc_container_results and gc_container_results.returncode == 0: gc_container_results_json = json.loads(gc_container_results.stdout) - try : + try: latest_tag = gc_container_results_json[0].get("tags")[0] return latest_tag if latest_tag is not None else "NA" except IndexError: - 
logging.warning(f"Gcloud Container obtain empty tag for : {gc_container_results_json} - {docker_path}") + logging.warning( + f"Gcloud Container obtain empty tag for : {gc_container_results_json} - {docker_path}") return "NA" else: # Error handling error_message = gc_container_results.stderr.strip() if gc_container_results.stderr else gc_container_results.stdout.strip() - #print(f"Error: {error_message}") + # print(f"Error: {error_message}") else: return "NA" From 76a86eae4bf62c4052381a10430ee42c9e53180b Mon Sep 17 00:00:00 2001 From: bshifaw Date: Wed, 25 Oct 2023 16:09:25 -0400 Subject: [PATCH 10/18] make backup of old tsv instead of deleting it --- scripts/docker/docker_usage_sum.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/docker/docker_usage_sum.py b/scripts/docker/docker_usage_sum.py index b65b349a3..6266be2b0 100644 --- a/scripts/docker/docker_usage_sum.py +++ b/scripts/docker/docker_usage_sum.py @@ -43,7 +43,7 @@ def main(): OUT_SUMMARY_TSV = os.path.join(current_dir, "dockers.in_use.tsv") if os.path.exists(OUT_SUMMARY_TSV): - os.remove(OUT_SUMMARY_TSV) + os.rename(OUT_SUMMARY_TSV, OUT_SUMMARY_TSV + ".bak") wdl_files = get_wdl_files(dir_to_wdls=WDLS_DIR) global_docker_info = [] From c94e6801cc45ac684565d5e66aa238608b974611 Mon Sep 17 00:00:00 2001 From: bshifaw Date: Wed, 25 Oct 2023 16:45:58 -0400 Subject: [PATCH 11/18] updated regex pattern --- scripts/docker/docker_usage_sum.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/docker/docker_usage_sum.py b/scripts/docker/docker_usage_sum.py index 6266be2b0..4f380ff78 100644 --- a/scripts/docker/docker_usage_sum.py +++ b/scripts/docker/docker_usage_sum.py @@ -56,7 +56,7 @@ def main(): with open(wdl_path, "r") as file_content: content = file_content.read() - pattern = re.compile(r'.*docker:.*"') + pattern = re.compile(r'\s*docker:\s*"') if pattern.search(content): # If wdl file contains "docker:" matched_lines = [] file_content.seek(0) From 
17043ef94b15110d861d90a3971b9c36bff065f2 Mon Sep 17 00:00:00 2001 From: bshifaw Date: Wed, 25 Oct 2023 17:03:24 -0400 Subject: [PATCH 12/18] changed way of writing to tsv file --- scripts/docker/docker_usage_sum.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/scripts/docker/docker_usage_sum.py b/scripts/docker/docker_usage_sum.py index 4f380ff78..6f3022167 100644 --- a/scripts/docker/docker_usage_sum.py +++ b/scripts/docker/docker_usage_sum.py @@ -82,12 +82,13 @@ def main(): # Clear the previous line and print the progress print(f"Progress: {progress:.2f}%\r", end="") - with open(OUT_SUMMARY_TSV, "a") as tsv_file: + with open(OUT_SUMMARY_TSV, "w") as tsv_file: # Add header - tsv_file.write(f"DOCKER_NAME\tLATEST_TAG\tUSED_TAG\tFILE_LINE\tWDL_PATH") + tsv_file.write("DOCKER_NAME\tLATEST_TAG\tUSED_TAG\tFILE_LINE\tWDL_PATH") # Add content for docker_info_line in sorted(global_docker_info): - tsv_file.write("\n".join(docker_info_line) + "\n") + delimiter = "\n" + tsv_file.write(delimiter.join(docker_info_line) + "\n") print(f"DONE. 
PLEASE CHECKOUT TSV FILE: {OUT_SUMMARY_TSV}") @@ -137,8 +138,6 @@ def get_docker_info_from_string(wdl_lines: [tuple], wdl_path: str) -> list: docker_name) if latest_tag == "NA" else latest_tag docker_detail.append( f"{docker_name}\t{latest_tag}\t{used_tag}\t{line_num}\t{wdl_path_sum}") - else: - pass return docker_detail From 77dc9ce8084f42e7fafde82cafccf668fe7b712d Mon Sep 17 00:00:00 2001 From: bshifaw Date: Thu, 26 Oct 2023 14:11:04 -0400 Subject: [PATCH 13/18] Added py script to create markdown file from docker usage tsv --- scripts/docker/docker_usage_sum.py | 2 +- scripts/git_page/docker_usage_md.py | 87 +++++++++++++++++++++++++++++ scripts/git_page/utility.py | 34 +++++++++++ 3 files changed, 122 insertions(+), 1 deletion(-) create mode 100644 scripts/git_page/docker_usage_md.py diff --git a/scripts/docker/docker_usage_sum.py b/scripts/docker/docker_usage_sum.py index 6f3022167..e44b4db3a 100644 --- a/scripts/docker/docker_usage_sum.py +++ b/scripts/docker/docker_usage_sum.py @@ -40,7 +40,7 @@ def main(): print("COLLECTING DOCKERS IN USE...") WDLS_DIR = os.path.abspath(os.path.join(current_dir, "../../wdl")) - OUT_SUMMARY_TSV = os.path.join(current_dir, "dockers.in_use.tsv") + OUT_SUMMARY_TSV = os.path.join(current_dir, "dockers_in_use.tsv") if os.path.exists(OUT_SUMMARY_TSV): os.rename(OUT_SUMMARY_TSV, OUT_SUMMARY_TSV + ".bak") diff --git a/scripts/git_page/docker_usage_md.py b/scripts/git_page/docker_usage_md.py new file mode 100644 index 000000000..e455df5a0 --- /dev/null +++ b/scripts/git_page/docker_usage_md.py @@ -0,0 +1,87 @@ +import argparse +import os +import logging +import utility as util + +# Get the current working directory +cwd = os.getcwd() + +# Get the path to the scripts directory +scripts_dir = os.path.join(cwd, "scripts") + +# Get the path to the git_page directory +git_page_dir = os.path.join(scripts_dir, "git_page") + +Logger = logging.getLogger(__name__) +logging.basicConfig() + + +def main(): + # read command-line arguments + parser 
= argparse.ArgumentParser( + description="Generate docker_usage.md file by running the docker_usage_sum.py and using the output to generate the markdown file." + ) + parser.add_argument("--output_path", help="Path to the out markdown file") + parser.add_argument("--debug", action="store_true", help="verbose logging") + + args = parser.parse_args() + util.set_logging_level(args) + + current_dir = os.path.abspath(os.path.dirname(__file__)) + parent_dir = os.path.abspath(os.path.join(current_dir, os.pardir)) + docker_script_dir = os.path.join(parent_dir, "docker") + OUT_FILE_BASENAME = "Dockers_In_Use" + OUT_SUMMARY_TSV = os.path.join(docker_script_dir, OUT_FILE_BASENAME + ".tsv") + + if args.output_path: + markdown_file = os.path.join(args.output_path, OUT_FILE_BASENAME + ".md") + else: + markdown_file = os.path.join(current_dir, OUT_FILE_BASENAME + ".md") + + resolved_markdown_file = os.path.abspath(markdown_file) + + Logger.debug(f"markdown_file: {resolved_markdown_file}") + + # Run docker_usage_sum.py + docker_usage_sum_cmd = ["python3", + os.path.join(docker_script_dir, "docker_usage_sum.py")] + util.run_command(docker_usage_sum_cmd) + + markdown_table = tsv_to_markdown(tsv_file=OUT_SUMMARY_TSV) + write_docker_usage_to_markdown( + resolved_markdown_file=resolved_markdown_file, markdown_table=markdown_table + ) + + +def tsv_to_markdown(tsv_file: str) -> str: + """ + Converts a tsv file to markdown table format + @param tsv_file: + @return: + """ + with open(tsv_file, "r") as f: + lines = f.readlines() + + markdown_table = "|".join(lines[0].split("\t")) + markdown_table += "|".join(["---"] * len(lines[0].split("\t"))) + markdown_table += "\n" + for line in lines[1:]: + markdown_table += "|".join(line.split("\t")) + + return markdown_table + + +def write_docker_usage_to_markdown(resolved_markdown_file, markdown_table): + with open(resolved_markdown_file, "w") as md_file: + md_file.write("# Docker Usage\n\n") + md_file.write("The following table lists the docker 
images used in the " + "workflows in this repository.\n\n") + md_file.write("The table is generated by running the [docker_usage_sum.py](" + "../docker/docker_usage_sum.py) script.\n\n") + md_file.write("The script is run by the [generate_docker_usage_md.py](" + "../git_page/generate_docker_usage_md.py) script.\n\n") + md_file.write(markdown_table) + + +if __name__ == "__main__": + main() diff --git a/scripts/git_page/utility.py b/scripts/git_page/utility.py index 82061afb1..d92b4fe12 100644 --- a/scripts/git_page/utility.py +++ b/scripts/git_page/utility.py @@ -1,5 +1,6 @@ import glob import logging +import subprocess from pathlib import Path, PurePosixPath, PurePath Logger = logging.getLogger(__name__) @@ -62,3 +63,36 @@ def get_all_files_with_extension(directory: Path, ext: str) -> list: Logger.debug(f'Getting all files with extension {ext} in {directory}...') return glob.glob(f'{directory}/**/*.{ext}', recursive=True) + + +def run_command(command: list, log_output=True, ) -> None: + """ + Run a shell command and wait for it to complete. + + :param command: A list representing the shell command to execute. + :param log_output: Whether to log the command's output (default is True). + :return: None + + This function runs the specified shell command and waits for it to complete. + It logs the command before execution and raises an exception if the command fails. 
+ """ + cmd_str = ' '.join(command) # Convert the command list to a string for logging + logging.debug(f'Running command: {cmd_str}...') + + try: + result = subprocess.run(command, check=True, shell=False, + stdout=subprocess.PIPE, stderr=subprocess.PIPE, + text=True, encoding='utf-8') + + if log_output: + if result.stdout: + logging.debug(f'Command output (stdout):\n{result.stdout}') + if result.stderr: + logging.debug(f'Command output (stderr):\n{result.stderr}') + except subprocess.CalledProcessError as e: + logging.error(f'Command failed with error: {e}') + raise + except Exception as e: + logging.error(f'An unexpected error occurred: {e}') + raise + From b9fed9c02caa61c83a8e12f1de762f648bccdd78 Mon Sep 17 00:00:00 2001 From: bshifaw Date: Thu, 26 Oct 2023 14:26:25 -0400 Subject: [PATCH 14/18] edited docker_usage_sum.py so that it can be executed from any dir --- scripts/docker/docker_usage_sum.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/scripts/docker/docker_usage_sum.py b/scripts/docker/docker_usage_sum.py index e44b4db3a..557ac0f47 100644 --- a/scripts/docker/docker_usage_sum.py +++ b/scripts/docker/docker_usage_sum.py @@ -37,6 +37,7 @@ def main(): parser.parse_args() current_dir = os.path.abspath(os.path.dirname(__file__)) + repo_dir = os.path.abspath(os.path.join(current_dir, os.pardir, os.pardir)) print("COLLECTING DOCKERS IN USE...") WDLS_DIR = os.path.abspath(os.path.join(current_dir, "../../wdl")) @@ -67,7 +68,7 @@ def main(): matched_lines.append((line_number, line.strip())) docker_info: list[str] = get_docker_info_from_string( - wdl_lines=matched_lines, wdl_path=wdl_name + wdl_lines=matched_lines, wdl_path=wdl_name, repo_dir=repo_dir ) sorted_info: list = sorted(docker_info, reverse=False) @@ -108,7 +109,7 @@ def get_wdl_files(dir_to_wdls: str) -> list: return wdl_files -def get_docker_info_from_string(wdl_lines: [tuple], wdl_path: str) -> list: +def get_docker_info_from_string(wdl_lines: [tuple], 
wdl_path: str, repo_dir: str) -> list: """ Returns a list of docker info @param wdl_path: path to wdl file @@ -128,11 +129,14 @@ def get_docker_info_from_string(wdl_lines: [tuple], wdl_path: str) -> list: docker_name = docker_name_and_version.split(":")[0] # Get latest tag from list of docker details if it was already retrieved - latest_tag = get_tag_from_docker_details(docker_detail=docker_detail, - docker_name=docker_name) + latest_tag = get_tag_from_docker_details( + docker_detail=docker_detail, docker_name=docker_name + ) # Get latest tag from local docker if it was not retrieved from list of docker details latest_tag = get_latest_local_docker_tag( - docker_name) if latest_tag == "NA" else latest_tag + docker_name=docker_name, repo_dir=repo_dir + ) if latest_tag == "NA" else latest_tag + # If the latest tag is not found locally, try to get it from remote latest_tag = get_latest_remote_docker_tag( docker_name) if latest_tag == "NA" else latest_tag @@ -352,14 +356,14 @@ def get_latest_tag_from_quay(docker_path: str) -> str: pass -def get_latest_local_docker_tag(docker_name: str) -> str: +def get_latest_local_docker_tag(docker_name: str, repo_dir: str) -> str: """ Returns the latest tag of a docker from the local docker directory @param docker_name: name of the docker e.g. "gatk" @return: """ docker_name = os.path.basename(docker_name) - docker_dir = "../docker" + docker_dir = os.path.join(repo_dir, "docker") latest_tag = "NA" for docker_im_dir in os.listdir(docker_dir): From fd1db31ce11baf94faf7d3d3ba946e257a0da63a Mon Sep 17 00:00:00 2001 From: bshifaw Date: Thu, 26 Oct 2023 14:27:25 -0400 Subject: [PATCH 15/18] renamed docker_usage_md.py to generate_docker_usage_md.py. 
added generate_docker_usage_md.py to repo site by way of adding it to git_page.yml --- .github/workflows/git_page.yml | 2 ++ .../{docker_usage_md.py => generate_docker_usage_md.py} | 0 2 files changed, 2 insertions(+) rename scripts/git_page/{docker_usage_md.py => generate_docker_usage_md.py} (100%) diff --git a/.github/workflows/git_page.yml b/.github/workflows/git_page.yml index 89558586e..6737883a5 100644 --- a/.github/workflows/git_page.yml +++ b/.github/workflows/git_page.yml @@ -45,6 +45,8 @@ jobs: python3 ./scripts/git_page/add_dot_link_to_md.py --md_dir ./docs/workflows --dot_dir ./docs/dot + python ./scripts/git_page/generate_docker_usage_md.py --output_path ./docs/development_guide + mkdocs build - name: Deploy uses: JamesIves/github-pages-deploy-action@releases/v4 diff --git a/scripts/git_page/docker_usage_md.py b/scripts/git_page/generate_docker_usage_md.py similarity index 100% rename from scripts/git_page/docker_usage_md.py rename to scripts/git_page/generate_docker_usage_md.py From 5a2c250fb6a494cf466afff3abc22e9c43d7f67a Mon Sep 17 00:00:00 2001 From: bshifaw Date: Thu, 26 Oct 2023 14:38:13 -0400 Subject: [PATCH 16/18] edited markdown title, removed links in header because doesn't work if output file directed to a different location from current dir --- scripts/git_page/generate_docker_usage_md.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/scripts/git_page/generate_docker_usage_md.py b/scripts/git_page/generate_docker_usage_md.py index e455df5a0..1a78c4338 100644 --- a/scripts/git_page/generate_docker_usage_md.py +++ b/scripts/git_page/generate_docker_usage_md.py @@ -73,13 +73,13 @@ def tsv_to_markdown(tsv_file: str) -> str: def write_docker_usage_to_markdown(resolved_markdown_file, markdown_table): with open(resolved_markdown_file, "w") as md_file: - md_file.write("# Docker Usage\n\n") + md_file.write("# Docker Usage Summary\n\n") md_file.write("The following table lists the docker images used in the " "workflows 
in this repository.\n\n") - md_file.write("The table is generated by running the [docker_usage_sum.py](" - "../docker/docker_usage_sum.py) script.\n\n") - md_file.write("The script is run by the [generate_docker_usage_md.py](" - "../git_page/generate_docker_usage_md.py) script.\n\n") + md_file.write("The table is generated by running the docker_usage_sum.py" + " script.\n\n") + md_file.write("The script is run by the ../git_page/generate_docker_usage_md.py" + " script.\n\n") md_file.write(markdown_table) From aa50c394b26a4629ec1466a8da39c587c3be51cf Mon Sep 17 00:00:00 2001 From: bshifaw Date: Wed, 1 Nov 2023 16:34:06 -0400 Subject: [PATCH 17/18] refactored main function --- scripts/docker/docker_usage_sum.py | 108 ++++++++++++++++++----------- 1 file changed, 68 insertions(+), 40 deletions(-) diff --git a/scripts/docker/docker_usage_sum.py b/scripts/docker/docker_usage_sum.py index 557ac0f47..446c0f85d 100644 --- a/scripts/docker/docker_usage_sum.py +++ b/scripts/docker/docker_usage_sum.py @@ -47,49 +47,19 @@ def main(): os.rename(OUT_SUMMARY_TSV, OUT_SUMMARY_TSV + ".bak") wdl_files = get_wdl_files(dir_to_wdls=WDLS_DIR) - global_docker_info = [] - - total_files = len(wdl_files) # Used for Progression calculation - - for index, wdl_path in enumerate(wdl_files, start=1): - - wdl_name = wdl_path - - with open(wdl_path, "r") as file_content: - content = file_content.read() - pattern = re.compile(r'\s*docker:\s*"') - if pattern.search(content): # If wdl file contains "docker:" - matched_lines = [] - file_content.seek(0) - lines = file_content.readlines() - - for line_number, line in enumerate(lines, start=1): - if pattern.search(line): - matched_lines.append((line_number, line.strip())) - - docker_info: list[str] = get_docker_info_from_string( - wdl_lines=matched_lines, wdl_path=wdl_name, repo_dir=repo_dir - ) - - sorted_info: list = sorted(docker_info, reverse=False) - global_docker_info.append(sorted_info) - else: - pass + pattern = re.compile(r'^\s*docker:\s*"') + 
global_docker_info = process_wdl_files( + wdl_files=wdl_files, pattern=pattern, repo_dir=repo_dir + ) - # Progression - # Calculate the percentage completion - progress: float = (index + 1) / total_files * 100 + # Remove empty elements in list + non_empty_docker_info = [x for x in global_docker_info if x] - # Clear the previous line and print the progress - print(f"Progress: {progress:.2f}%\r", end="") - with open(OUT_SUMMARY_TSV, "w") as tsv_file: - # Add header - tsv_file.write("DOCKER_NAME\tLATEST_TAG\tUSED_TAG\tFILE_LINE\tWDL_PATH") - # Add content - for docker_info_line in sorted(global_docker_info): - delimiter = "\n" - tsv_file.write(delimiter.join(docker_info_line) + "\n") + write_docker_info_to_tsv( + output_summary_tsv=OUT_SUMMARY_TSV, + docker_info=non_empty_docker_info + ) print(f"DONE. PLEASE CHECKOUT TSV FILE: {OUT_SUMMARY_TSV}") @@ -381,5 +351,63 @@ def get_latest_local_docker_tag(docker_name: str, repo_dir: str) -> str: return latest_tag +def process_wdl_files(wdl_files: list[str], pattern, repo_dir: str): + """ + Returns a list of docker info + @param wdl_files: list of wdl files + @param pattern: pattern to search for + @param repo_dir: directory of the repo + @return: + """ + global_docker_info = [] + total_files = len(wdl_files) + + for index, wdl_path in enumerate(wdl_files, start=1): + wdl_name = wdl_path + matched_lines = [] + + with open(wdl_path, "r") as file_content: + lines = file_content.readlines() + for line_number, line in enumerate(lines, start=1): + if pattern.search(line): + matched_lines.append((line_number, line.strip())) + + docker_info: list[str] = get_docker_info_from_string( + wdl_lines=matched_lines, wdl_path=wdl_name, repo_dir=repo_dir + ) + + sorted_info: list = sorted(docker_info, reverse=False) + global_docker_info.append(sorted_info) + + # Visual progression to show percentage of files processed + progress: float = (index + 1) / total_files * 100 + + # Clear the previous line and print the progress + print(f"Progress: 
{progress:.2f}%\r", end="") + + return global_docker_info + + +def write_docker_info_to_tsv( + output_summary_tsv: str, docker_info: list[list] +): + """ + Writes docker info to tsv file + + @param output_summary_tsv: + @param docker_info: + @return: + """ + + with open(output_summary_tsv, "w") as tsv_file: + # Add header + tsv_file.write("DOCKER_NAME\tLATEST_TAG\tUSED_TAG\tFILE_LINE\tWDL_PATH\n") + # Add content + for docker_info_line in sorted(docker_info): + delimiter = "\n" + tsv_file.write(delimiter.join(docker_info_line) + "\n") + + + if __name__ == "__main__": main() From 85621db42fd06b639c6f0fbb4391165fe9c5c834 Mon Sep 17 00:00:00 2001 From: bshifaw Date: Wed, 1 Nov 2023 16:34:22 -0400 Subject: [PATCH 18/18] fix docker line --- wdl/tasks/Utility/Utils.wdl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/wdl/tasks/Utility/Utils.wdl b/wdl/tasks/Utility/Utils.wdl index 754eda331..62ee671e2 100644 --- a/wdl/tasks/Utility/Utils.wdl +++ b/wdl/tasks/Utility/Utils.wdl @@ -2258,7 +2258,9 @@ task StopWorkflow { command <<< echo -e "Workflow explicitly stopped because \n ~{reason}." && exit 1 >>> - runtime {docker: "gcr.io/cloud-marketplace/google/ubuntu2004:latest"} + runtime { + docker: "gcr.io/cloud-marketplace/google/ubuntu2004:latest" + } } task InferSampleName {