From fa372b0d1091289667fb590a376c436ff13911de Mon Sep 17 00:00:00 2001
From: Matthew G McGovern <mamcgove@microsoft.com>
Date: Mon, 11 Nov 2024 09:50:12 -0800
Subject: [PATCH 1/5] urlparse tar fix

Fixing one bug uncovered a few others.
Fixes the way the Tar tool handles fetching
the filename of the tar file it downloads.
---
 microsoft/testsuites/dpdk/common.py   | 19 +++++++++++++------
 microsoft/testsuites/dpdk/dpdkutil.py |  2 +-
 2 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/microsoft/testsuites/dpdk/common.py b/microsoft/testsuites/dpdk/common.py
index b7c54bded1..1354dcb3ed 100644
--- a/microsoft/testsuites/dpdk/common.py
+++ b/microsoft/testsuites/dpdk/common.py
@@ -3,6 +3,7 @@
 
 from pathlib import PurePath
 from typing import Any, Callable, Dict, List, Optional, Sequence, Type, Union
+from urllib.parse import urlparse
 
 from assertpy import assert_that
 from semver import VersionInfo
@@ -126,7 +127,6 @@ def download(self) -> PurePath:
         for suffix in [".tar.gz", ".tar.bz2", ".tar"]:
             if self._tar_url.endswith(suffix):
                 is_tarball = True
-                tarfile_suffix = suffix
                 break
         assert_that(is_tarball).described_as(
             (
@@ -137,7 +137,6 @@ def download(self) -> PurePath:
         if self._is_remote_tarball:
             tarfile = node.tools[Wget].get(
                 self._tar_url,
-                file_path=str(work_path),
                 overwrite=False,
             )
             remote_path = node.get_pure_path(tarfile)
@@ -149,10 +148,9 @@ def download(self) -> PurePath:
                 local_path=PurePath(self._tar_url),
                 node_path=remote_path,
             )
+        tar_root_folder = node.tools[Tar].get_root_folder(str(remote_path))
         # create tarfile dest dir
-        self.asset_path = work_path.joinpath(
-            self.tar_filename[: -(len(tarfile_suffix))]
-        )
+        self.asset_path = work_path.joinpath(tar_root_folder)
         # unpack into the dest dir
         # force name as tarfile name
         node.tools[Tar].extract(
@@ -350,7 +348,16 @@ def check_dpdk_support(node: Node) -> None:
 
 
 def is_url_for_tarball(url: str) -> bool:
-    return ".tar" in PurePath(url).suffixes
+    # fetch the resource from the url
+    # ex. get example/thing.tar from www.github.com/example/thing.tar.gz
+    url_path = urlparse(url).path
+    if not url_path:
+        return False
+    suffixes = PurePath(url_path).suffixes
+    if not suffixes:
+        return False
+    # check if '.tar' in [ '.tar', '.gz' ]
+    return ".tar" in suffixes
 
 
 def is_url_for_git_repo(url: str) -> bool:
diff --git a/microsoft/testsuites/dpdk/dpdkutil.py b/microsoft/testsuites/dpdk/dpdkutil.py
index 71753a730b..9310ce0394 100644
--- a/microsoft/testsuites/dpdk/dpdkutil.py
+++ b/microsoft/testsuites/dpdk/dpdkutil.py
@@ -135,7 +135,7 @@ def get_rdma_core_installer(
         if is_url_for_git_repo(rdma_source):
             # else, if we have a user provided rdma-core source, use it
             downloader: Downloader = GitDownloader(node, rdma_source, rdma_branch)
-        elif is_url_for_tarball(rdma_branch):
+        elif is_url_for_tarball(rdma_source):
             downloader = TarDownloader(node, rdma_source)
         else:
             # throw on unrecognized rdma core source type

From f996ba6dafefc940c60b3692280a4f34f9b1a76a Mon Sep 17 00:00:00 2001
From: Matthew G McGovern <mamcgove@microsoft.com>
Date: Mon, 11 Nov 2024 11:28:11 -0800
Subject: [PATCH 2/5] Wget: add caching

---
 lisa/base_tools/wget.py             | 34 ++++++++++++++++++++++++-----
 microsoft/testsuites/dpdk/common.py |  3 +--
 2 files changed, 30 insertions(+), 7 deletions(-)

diff --git a/lisa/base_tools/wget.py b/lisa/base_tools/wget.py
index 160c5d0133..98ee17db7e 100644
--- a/lisa/base_tools/wget.py
+++ b/lisa/base_tools/wget.py
@@ -1,5 +1,5 @@
 import re
-from typing import TYPE_CHECKING, Optional, Tuple, Type
+from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Type
 from urllib.parse import urlparse
 
 from retry import retry
@@ -24,6 +24,10 @@ class Wget(Tool):
     def command(self) -> str:
         return "wget"
 
+    def _initialize(self, *args: Any, **kwargs: Any) -> None:
+        self._url_file_cache: Dict[str, str] = dict()
+        return super()._initialize(*args, **kwargs)
+
     @property
     def can_install(self) -> bool:
         return True
@@ -45,8 +49,19 @@ def get(
         force_run: bool = False,
         timeout: int = 600,
     ) -> str:
+        cached_filename = self._url_file_cache.get(url, None)
+        if cached_filename:
+            if force_run:
+                del self._url_file_cache[url]
+            else:
+                return cached_filename
+
         is_valid_url(url)
 
+        if not filename:
+            filename = urlparse(url).path.split("/")[-1]
+            self._log.debug(f"filename is not provided, use {filename} from url.")
+
         file_path, download_path = self._ensure_download_path(file_path, filename)
 
         # remove existing file and dir to download again.
@@ -84,6 +99,7 @@ def get(
                     f" stdout: {command_result.stdout}"
                     f" templog: {temp_log}"
                 )
+            self.node.tools[Rm].remove_file(log_file, sudo=sudo)
         else:
             download_file_path = download_path
 
@@ -91,7 +107,7 @@ def get(
             raise LisaTimeoutException(
                 f"wget command is timed out after {timeout} seconds."
             )
-        actual_file_path = self.node.execute(
+        ls_result = self.node.execute(
             f"ls {download_file_path}",
             shell=True,
             sudo=sudo,
@@ -99,10 +115,11 @@ def get(
             expected_exit_code_failure_message="File path does not exist, "
             f"{download_file_path}",
         )
+        actual_file_path = ls_result.stdout.strip()
+        self._url_file_cache[url] = actual_file_path
         if executable:
             self.node.execute(f"chmod +x {actual_file_path}", sudo=sudo)
-        self.node.tools[Rm].remove_file(log_file, sudo=sudo)
-        return actual_file_path.stdout
+        return actual_file_path
 
     def verify_internet_access(self) -> bool:
         try:
@@ -155,6 +172,13 @@ def get(
         force_run: bool = False,
         timeout: int = 600,
     ) -> str:
+        cached_filename = self._url_file_cache.get(url, None)
+        if cached_filename:
+            if force_run:
+                del self._url_file_cache[url]
+            else:
+                return cached_filename
+
         ls = self.node.tools[Ls]
 
         if not filename:
@@ -182,5 +206,5 @@ def get(
             force_run=force_run,
             timeout=timeout,
         )
-
+        self._url_file_cache[url] = download_path
         return download_path
diff --git a/microsoft/testsuites/dpdk/common.py b/microsoft/testsuites/dpdk/common.py
index 1354dcb3ed..6269925b24 100644
--- a/microsoft/testsuites/dpdk/common.py
+++ b/microsoft/testsuites/dpdk/common.py
@@ -136,8 +136,7 @@ def download(self) -> PurePath:
         ).is_true()
         if self._is_remote_tarball:
             tarfile = node.tools[Wget].get(
-                self._tar_url,
-                overwrite=False,
+                self._tar_url, overwrite=False, file_path=str(node.get_working_path())
             )
             remote_path = node.get_pure_path(tarfile)
             self.tar_filename = remote_path.name

From 252f54a9d65d1c4f53615966d5f1d3e2ffe164c0 Mon Sep 17 00:00:00 2001
From: Matthew G McGovern <mamcgove@microsoft.com>
Date: Tue, 12 Nov 2024 09:00:46 -0800
Subject: [PATCH 3/5] Tar: Allow skip-old-files option

When running Wget.get(..., force_run=False) and Tar.extract it is
useful to allow Tar to skip extracting existing files on the second
pass.

Allow the skip-old-files option, so Tar.extract will not overwrite
existing files in the output directory.

Note: it's important to not use this option if you are providing LISA
with a default filename for your tarballs. This option could silently
allow Tar.extract to not update the contents of a directory with the
newer file contents.

I don't see anyone using that schema now. My apologies to future devs who
find this commit message while debugging that issue.

Tar: chi comments
---
 lisa/tools/tar.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/lisa/tools/tar.py b/lisa/tools/tar.py
index d9fa920790..e26b3ee268 100644
--- a/lisa/tools/tar.py
+++ b/lisa/tools/tar.py
@@ -35,6 +35,7 @@ def extract(
         gzip: bool = False,
         sudo: bool = False,
         raise_error: bool = True,
+        skip_existing_files: bool = False,
     ) -> None:
         # create folder when it doesn't exist
         assert_that(strip_components).described_as(
@@ -48,6 +49,21 @@ def extract(
         if strip_components:
             # optionally strip N top level components from a tar file
             tar_cmd += f" --strip-components={strip_components}"
+
+        if skip_existing_files:
+            # NOTE:
+            # This option is for when you are using
+            #  Wget.get(..., force_run=False)
+            #
+            # Do not use this option if:
+            # - You may need to extract multiple versions of a
+            #   given tarball on a node
+            # - You have provided a default output filename to Wget.get
+            #   to fetch the tarball
+            #
+            # This skip-old-files option could silently skip extracting
+            # the second version of the tarball.
+            tar_cmd += " --skip-old-files"
         result = self.run(tar_cmd, shell=True, force_run=True, sudo=sudo)
         if raise_error:
             result.assert_exit_code(

From 57fd240586a1d0a7a98f7723cf0f14b320180ecd Mon Sep 17 00:00:00 2001
From: Matthew G McGovern <mamcgove@microsoft.com>
Date: Tue, 12 Nov 2024 09:08:45 -0800
Subject: [PATCH 4/5] TarDownloader: enable skip_old_files option in tar
 extract

Dpdk: chi comments
---
 microsoft/testsuites/dpdk/common.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/microsoft/testsuites/dpdk/common.py b/microsoft/testsuites/dpdk/common.py
index 6269925b24..5a78d134a1 100644
--- a/microsoft/testsuites/dpdk/common.py
+++ b/microsoft/testsuites/dpdk/common.py
@@ -152,10 +152,13 @@ def download(self) -> PurePath:
         self.asset_path = work_path.joinpath(tar_root_folder)
         # unpack into the dest dir
         # force name as tarfile name
+        # add option to skip files which already exist on disk
+        # in the event we have already extracted this specific tar
         node.tools[Tar].extract(
             file=str(remote_path),
             dest_dir=str(work_path),
             gzip=True,
+            skip_existing_files=True,
         )
         return self.asset_path
 

From 701b26710592a9fc2018a8d909b99d57d3cc4b49 Mon Sep 17 00:00:00 2001
From: Matthew G McGovern <mamcgove@microsoft.com>
Date: Tue, 19 Nov 2024 12:12:34 -0800
Subject: [PATCH 5/5] Tar: fix subclass prototype

---
 lisa/tools/tar.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lisa/tools/tar.py b/lisa/tools/tar.py
index e26b3ee268..1da6ce07f2 100644
--- a/lisa/tools/tar.py
+++ b/lisa/tools/tar.py
@@ -143,6 +143,7 @@ def extract(
         gzip: bool = False,
         sudo: bool = False,
         raise_error: bool = True,
+        skip_existing_files: bool = False,
     ) -> None:
         mkdir = self.node.tools[Mkdir]
         mkdir.create_directory(dest_dir)