From fa372b0d1091289667fb590a376c436ff13911de Mon Sep 17 00:00:00 2001 From: Matthew G McGovern Date: Mon, 11 Nov 2024 09:50:12 -0800 Subject: [PATCH 1/5] urlparse tar fix Fixing one bug uncovered a few others. Fixes the way the Tar tool handles fetching the filename of the tar file it downloads. --- microsoft/testsuites/dpdk/common.py | 19 +++++++++++++------ microsoft/testsuites/dpdk/dpdkutil.py | 2 +- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/microsoft/testsuites/dpdk/common.py b/microsoft/testsuites/dpdk/common.py index b7c54bded1..1354dcb3ed 100644 --- a/microsoft/testsuites/dpdk/common.py +++ b/microsoft/testsuites/dpdk/common.py @@ -3,6 +3,7 @@ from pathlib import PurePath from typing import Any, Callable, Dict, List, Optional, Sequence, Type, Union +from urllib.parse import urlparse from assertpy import assert_that from semver import VersionInfo @@ -126,7 +127,6 @@ def download(self) -> PurePath: for suffix in [".tar.gz", ".tar.bz2", ".tar"]: if self._tar_url.endswith(suffix): is_tarball = True - tarfile_suffix = suffix break assert_that(is_tarball).described_as( ( @@ -137,7 +137,6 @@ def download(self) -> PurePath: if self._is_remote_tarball: tarfile = node.tools[Wget].get( self._tar_url, - file_path=str(work_path), overwrite=False, ) remote_path = node.get_pure_path(tarfile) @@ -149,10 +148,9 @@ def download(self) -> PurePath: local_path=PurePath(self._tar_url), node_path=remote_path, ) + tar_root_folder = node.tools[Tar].get_root_folder(str(remote_path)) # create tarfile dest dir - self.asset_path = work_path.joinpath( - self.tar_filename[: -(len(tarfile_suffix))] - ) + self.asset_path = work_path.joinpath(tar_root_folder) # unpack into the dest dir # force name as tarfile name node.tools[Tar].extract( @@ -350,7 +348,16 @@ def check_dpdk_support(node: Node) -> None: def is_url_for_tarball(url: str) -> bool: - return ".tar" in PurePath(url).suffixes + # fetch the resource from the url + # ex. get example/thing.tar from www.github.com/example/thing.tar.gz + url_path = urlparse(url).path + if not url_path: + return False + suffixes = PurePath(url_path).suffixes + if not suffixes: + return False + # check if '.tar' in [ '.tar', '.gz' ] + return ".tar" in suffixes def is_url_for_git_repo(url: str) -> bool: diff --git a/microsoft/testsuites/dpdk/dpdkutil.py b/microsoft/testsuites/dpdk/dpdkutil.py index 71753a730b..9310ce0394 100644 --- a/microsoft/testsuites/dpdk/dpdkutil.py +++ b/microsoft/testsuites/dpdk/dpdkutil.py @@ -135,7 +135,7 @@ def get_rdma_core_installer( if is_url_for_git_repo(rdma_source): # else, if we have a user provided rdma-core source, use it downloader: Downloader = GitDownloader(node, rdma_source, rdma_branch) - elif is_url_for_tarball(rdma_branch): + elif is_url_for_tarball(rdma_source): downloader = TarDownloader(node, rdma_source) else: # throw on unrecognized rdma core source type From f996ba6dafefc940c60b3692280a4f34f9b1a76a Mon Sep 17 00:00:00 2001 From: Matthew G McGovern Date: Mon, 11 Nov 2024 11:28:11 -0800 Subject: [PATCH 2/5] Wget: add caching --- lisa/base_tools/wget.py | 34 ++++++++++++++++++++++++----- microsoft/testsuites/dpdk/common.py | 3 +-- 2 files changed, 30 insertions(+), 7 deletions(-) diff --git a/lisa/base_tools/wget.py b/lisa/base_tools/wget.py index 160c5d0133..98ee17db7e 100644 --- a/lisa/base_tools/wget.py +++ b/lisa/base_tools/wget.py @@ -1,5 +1,5 @@ import re -from typing import TYPE_CHECKING, Optional, Tuple, Type +from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Type from urllib.parse import urlparse from retry import retry @@ -24,6 +24,10 @@ class Wget(Tool): def command(self) -> str: return "wget" + def _initialize(self, *args: Any, **kwargs: Any) -> None: + self._url_file_cache: Dict[str, str] = dict() + return super()._initialize(*args, **kwargs) + @property def can_install(self) -> bool: return True @@ -45,8 +49,19 @@ def get( force_run: bool = False, timeout: int = 600, ) -> str: + cached_filename = self._url_file_cache.get(url, None) + if cached_filename: + if force_run: + del self._url_file_cache[url] + else: + return cached_filename + is_valid_url(url) + if not filename: + filename = urlparse(url).path.split("/")[-1] + self._log.debug(f"filename is not provided, use {filename} from url.") + file_path, download_path = self._ensure_download_path(file_path, filename) # remove existing file and dir to download again. @@ -84,6 +99,7 @@ def get( f" stdout: {command_result.stdout}" f" templog: {temp_log}" ) + self.node.tools[Rm].remove_file(log_file, sudo=sudo) else: download_file_path = download_path @@ -91,7 +107,7 @@ def get( raise LisaTimeoutException( f"wget command is timed out after {timeout} seconds." ) - actual_file_path = self.node.execute( + ls_result = self.node.execute( f"ls {download_file_path}", shell=True, sudo=sudo, @@ -99,10 +115,11 @@ def get( expected_exit_code_failure_message="File path does not exist, " f"{download_file_path}", ) + actual_file_path = ls_result.stdout.strip() + self._url_file_cache[url] = actual_file_path if executable: self.node.execute(f"chmod +x {actual_file_path}", sudo=sudo) - self.node.tools[Rm].remove_file(log_file, sudo=sudo) - return actual_file_path.stdout + return actual_file_path def verify_internet_access(self) -> bool: try: @@ -155,6 +172,13 @@ def get( force_run: bool = False, timeout: int = 600, ) -> str: + cached_filename = self._url_file_cache.get(url, None) + if cached_filename: + if force_run: + del self._url_file_cache[url] + else: + return cached_filename + ls = self.node.tools[Ls] if not filename: @@ -182,5 +206,5 @@ def get( force_run=force_run, timeout=timeout, ) - + self._url_file_cache[url] = download_path return download_path diff --git a/microsoft/testsuites/dpdk/common.py b/microsoft/testsuites/dpdk/common.py index 1354dcb3ed..6269925b24 100644 --- a/microsoft/testsuites/dpdk/common.py +++ b/microsoft/testsuites/dpdk/common.py @@ -136,8 +136,7 @@ def download(self) -> PurePath: ).is_true() if self._is_remote_tarball: tarfile = node.tools[Wget].get( - self._tar_url, - overwrite=False, + self._tar_url, overwrite=False, file_path=str(node.get_working_path()) ) remote_path = node.get_pure_path(tarfile) self.tar_filename = remote_path.name From 252f54a9d65d1c4f53615966d5f1d3e2ffe164c0 Mon Sep 17 00:00:00 2001 From: Matthew G McGovern Date: Tue, 12 Nov 2024 09:00:46 -0800 Subject: [PATCH 3/5] Tar: Allow skip-old-files option When running Wget.get(..., force_run=False) and Tar.extract it is useful to allow Tar to skip extracting existing files on the second pass. Allow the skip-old-files option, so Tar.extract will not overwrite existing files in the output directory. Note: it's important to not use this option if you are providing LISA with a default filename for your tarballs. This option could silently allow Tar.extract to not update the contents of a directory with the newer file contents. I don't see anyone using that schema now. My apologies to future devs who find this commit message while debugging that issue. Tar: chi comments --- lisa/tools/tar.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/lisa/tools/tar.py b/lisa/tools/tar.py index d9fa920790..e26b3ee268 100644 --- a/lisa/tools/tar.py +++ b/lisa/tools/tar.py @@ -35,6 +35,7 @@ def extract( gzip: bool = False, sudo: bool = False, raise_error: bool = True, + skip_existing_files: bool = False, ) -> None: # create folder when it doesn't exist assert_that(strip_components).described_as( @@ -48,6 +49,21 @@ def extract( if strip_components: # optionally strip N top level components from a tar file tar_cmd += f" --strip-components={strip_components}" + + if skip_existing_files: + # NOTE: + # This option is for when you are using + # Wget.get(..., force_run=False) + # + # Do not use this option if: + # - You may need to extract multiple versions of a + # given tarball on a node + # - You have provided a default output filename to Wget.get + # to fetch the tarball + # + # This skip-old-files option could silently skip extracting + # the second version of the tarball. + tar_cmd += " --skip-old-files" result = self.run(tar_cmd, shell=True, force_run=True, sudo=sudo) if raise_error: result.assert_exit_code( From 57fd240586a1d0a7a98f7723cf0f14b320180ecd Mon Sep 17 00:00:00 2001 From: Matthew G McGovern Date: Tue, 12 Nov 2024 09:08:45 -0800 Subject: [PATCH 4/5] TarDownloader: enable skip_old_files option in tar extract Dpdk: chi comments --- microsoft/testsuites/dpdk/common.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/microsoft/testsuites/dpdk/common.py b/microsoft/testsuites/dpdk/common.py index 6269925b24..5a78d134a1 100644 --- a/microsoft/testsuites/dpdk/common.py +++ b/microsoft/testsuites/dpdk/common.py @@ -152,10 +152,13 @@ def download(self) -> PurePath: self.asset_path = work_path.joinpath(tar_root_folder) # unpack into the dest dir # force name as tarfile name + # add option to skip files which already exist on disk + # in the event we have already extracted this specific tar node.tools[Tar].extract( file=str(remote_path), dest_dir=str(work_path), gzip=True, + skip_existing_files=True, ) return self.asset_path From 701b26710592a9fc2018a8d909b99d57d3cc4b49 Mon Sep 17 00:00:00 2001 From: Matthew G McGovern Date: Tue, 19 Nov 2024 12:12:34 -0800 Subject: [PATCH 5/5] Tar: fix subclass prototype --- lisa/tools/tar.py | 1 + 1 file changed, 1 insertion(+) diff --git a/lisa/tools/tar.py b/lisa/tools/tar.py index e26b3ee268..1da6ce07f2 100644 --- a/lisa/tools/tar.py +++ b/lisa/tools/tar.py @@ -143,6 +143,7 @@ def extract( gzip: bool = False, sudo: bool = False, raise_error: bool = True, + skip_existing_files: bool = False, ) -> None: mkdir = self.node.tools[Mkdir] mkdir.create_directory(dest_dir)