diff --git a/.github/workflows/integration.yaml b/.github/workflows/integration.yaml
index 6563997..e70a926 100644
--- a/.github/workflows/integration.yaml
+++ b/.github/workflows/integration.yaml
@@ -1,8 +1,9 @@
 name: Integration test

 on:
-  pull_request_review:
-    types: [submitted, edited]
+  push:
+    branches:
+      - master

 concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}
@@ -10,8 +11,6 @@ concurrency:

 jobs:
   integration-test:
-    # Integration test only works while the Pull Request is approved or the source repository is trusted repository.
-    if: github.event.review.state == 'APPROVED' || github.event.pull_request.head.repo.full_name == 'aliyun/pai-python-sdk'
     runs-on: ubuntu-latest
     env:
       GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
diff --git a/noxfile.py b/noxfile.py
index 244846e..68d4a0d 100644
--- a/noxfile.py
+++ b/noxfile.py
@@ -50,6 +50,12 @@ def integration(session: Session):
         if os.environ.get(key, value) is not None
     }

+    # set pytest worker count to 2 * os.cpu_count() (logical CPUs) if not specified
+    if "-n" not in session.posargs and "--numprocesses" not in session.posargs:
+        pos_args = session.posargs + ["-n", str(os.cpu_count() * 2)]
+    else:
+        pos_args = session.posargs
+
     session.run(
         "pytest",
         "--cov-config=.coveragerc",
@@ -57,7 +63,7 @@ def integration(session: Session):
         "--cov-report=html",
         "--cov=pai",
         os.path.join("tests", "integration"),
-        *session.posargs,
+        *pos_args,
         env=env,
     )
     session.run(
diff --git a/pai/estimator.py b/pai/estimator.py
index 5441692..6510ee2 100644
--- a/pai/estimator.py
+++ b/pai/estimator.py
@@ -29,6 +29,9 @@
 from datetime import datetime
 from typing import Any, Dict, List, Optional, Union

+from Tea.exceptions import TeaException
+
+from .api.base import PaginatedResult
 from .api.entity_base import EntityBaseMixin
 from .common import ProviderAlibabaPAI, git_utils
 from .common.consts import INSTANCE_TYPE_LOCAL_GPU, FileSystemInputScheme, JobType
@@ -1521,7 +1524,9 @@ def _normalize_name(name: str) -> str:
         for name, value in self.estimator.hyperparameters.items():
             env[_TrainingEnv.ENV_PAI_HPS_PREFIX + _normalize_name(name)] = str(value)
             user_args.extend(["--" + name, shlex.quote(str(value))])
-        env[_TrainingEnv.ENV_PAI_USER_ARGS] = shlex.join(user_args)
+        env[_TrainingEnv.ENV_PAI_USER_ARGS] = " ".join(
+            [shlex.quote(v) for v in user_args]
+        )
         env[_TrainingEnv.ENV_PAI_HPS] = json.dumps(
             {name: str(value) for name, value in self.estimator.hyperparameters.items()}
         )
@@ -1878,15 +1883,27 @@ def __init__(
         self._future = None
         self._stop = False

-    def _list_logs(self):
-        page_number, page_offset = 1, 0
-        # print training job logs.
-        while not self._stop:
+    def _list_logs_api(self, page_number=1):
+        try:
             res = self.session.training_job_api.list_logs(
                 self.training_job_id,
                 page_number=page_number,
                 page_size=self.page_size,
             )
+            return res
+        except TeaException as e:
+            # hack: Backend service may raise an exception when the training job
+            # instance is not found.
+            if e.code == "TRAINING_JOB_INSTANCE_NOT_FOUND":
+                return PaginatedResult(items=[], total_count=0)
+            else:
+                raise e
+
+    def _list_logs(self):
+        page_number, page_offset = 1, 0
+        # print training job logs.
+        while not self._stop:
+            res = self._list_logs_api(page_number=page_number)
             # 1. move to next page
             if len(res.items) == self.page_size:
                 # print new logs starting from page_offset
@@ -1904,11 +1921,7 @@
             # When _stop is True, wait and print remaining logs.
             time.sleep(10)
             while True:
-                res = self.session.training_job_api.list_logs(
-                    self.training_job_id,
-                    page_number=page_number,
-                    page_size=self.page_size,
-                )
+                res = self._list_logs_api(page_number=page_number)
                 # There maybe more logs in the next page
                 if len(res.items) == self.page_size:
                     self._print_logs(logs=res.items[page_offset:])
diff --git a/pai/huggingface/model.py b/pai/huggingface/model.py
index 7b46b7a..1771470 100644
--- a/pai/huggingface/model.py
+++ b/pai/huggingface/model.py
@@ -259,7 +259,7 @@ def _get_supported_tf_versions_for_inference(self) -> List[str]:
         return res

     def _get_latest_tf_version_for_inference(self) -> str:
-        """Return the latest Transformers version for inference."""
+        """Return the latest transformers version for inference."""
         res = self._get_supported_tf_versions_for_inference()
         return max(
             res,
diff --git a/pai/model.py b/pai/model.py
index b631231..f480561 100644
--- a/pai/model.py
+++ b/pai/model.py
@@ -987,12 +987,18 @@ def _deploy_local(

         # build command to install requirements
         if requirements_list:
-            install_requirements = shlex.join(
-                ["python", "-m", "pip", "install"] + requirements_list
+            install_requirements = " ".join(
+                [
+                    shlex.quote(s)
+                    for s in ["python", "-m", "pip", "install"] + requirements_list
+                ]
             )
         elif requirements_path:
-            install_requirements = shlex.join(
-                ["python", "-m", "pip", "install", "-r", requirements_path]
+            install_requirements = " ".join(
+                [
+                    shlex.quote(s)
+                    for s in ["python", "-m", "pip", "install", "-r", requirements_path]
+                ]
             )
         else:
             install_requirements = ""
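
On the noxfile.py change: the default worker count is injected only when the
caller has not already passed -n or --numprocesses (the pytest-xdist flags),
so it can still be overridden from the command line, e.g. nox -s integration
-- -n 4. One caveat: os.cpu_count() may return None on platforms where the
count cannot be determined, in which case the multiplication would raise a
TypeError; guarding with (os.cpu_count() or 1) would cover that case.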
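
A note on the shlex.join replacements in pai/estimator.py and pai/model.py:
shlex.join was only added in Python 3.8, so quoting each token with
shlex.quote and joining on single spaces is the standard equivalent on older
interpreters. A minimal sketch of the equivalence (the shlex_join helper name
is illustrative, not part of the SDK):

    import shlex

    def shlex_join(split_command):
        # Quote each token, then join with single spaces; this mirrors
        # shlex.join, which is only available on Python 3.8+.
        return " ".join(shlex.quote(arg) for arg in split_command)

    # Tokens with spaces or shell metacharacters survive a round trip:
    cmd = shlex_join(["python", "-m", "pip", "install", "my pkg==1.0"])
    assert cmd == "python -m pip install 'my pkg==1.0'"
    assert shlex.split(cmd) == ["python", "-m", "pip", "install", "my pkg==1.0"]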