Skip to content

Commit

Permalink
KEP-2170: Add unit and E2E tests for model and dataset initializers
Browse files Browse the repository at this point in the history
Signed-off-by: wei-chenglai <[email protected]>
  • Loading branch information
seanlaii committed Nov 9, 2024
1 parent 95be3c0 commit f4167e5
Show file tree
Hide file tree
Showing 17 changed files with 776 additions and 1 deletion.
1 change: 1 addition & 0 deletions .github/workflows/integration-tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ jobs:
run: |
pip install pytest
python3 -m pip install -e sdk/python; pytest -s sdk/python/test --log-cli-level=debug --namespace=default
pytest pkg/initializer_v2/test/e2e
env:
GANG_SCHEDULER_NAME: ${{ matrix.gang-scheduler-name }}

Expand Down
4 changes: 3 additions & 1 deletion .github/workflows/test-python.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -32,4 +32,6 @@ jobs:
pip install -U './sdk/python[huggingface]'
- name: Run unit test for training sdk
run: pytest ./sdk/python/kubeflow/training/api/training_client_test.py
run: |
pytest ./sdk/python/kubeflow/training/api/training_client_test.py
pytest ./pkg/initializer_v2/test/unit
Empty file.
52 changes: 52 additions & 0 deletions pkg/initializer_v2/test/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
import os
import sys

import pytest

# Put the project root at the front of sys.path so absolute ``pkg.*`` imports
# resolve. The root is taken to be three directory levels above the directory
# that contains this file. Note the insert is unconditional: it happens every
# time this module is imported, whether or not the path is already present.
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "../../..")))


@pytest.fixture
def mock_env_vars():
    """Yield a helper that edits ``os.environ``; undo all edits afterwards.

    The yielded callable takes keyword arguments. A value of ``None`` removes
    that variable (if it exists); any other value is stored as ``str(value)``.
    The callable returns ``os.environ``. Once the test is done, the
    environment is reset to the snapshot taken when the fixture started.
    """
    snapshot = os.environ.copy()

    def _set_env_vars(**kwargs):
        for name in kwargs:
            new_value = kwargs[name]
            if new_value is not None:
                os.environ[name] = str(new_value)
                continue
            # None means "make sure this variable is not set".
            os.environ.pop(name, None)
        return os.environ

    yield _set_env_vars

    # Teardown: drop everything, then put back what was there before the test.
    os.environ.clear()
    os.environ.update(snapshot)


@pytest.fixture
def huggingface_model_instance():
    """Return a new ``HuggingFace`` object from the model initializer package.

    The class is imported inside the fixture, so the import only happens when
    a test actually requests this fixture.
    """
    from pkg.initializer_v2.model.huggingface import HuggingFace as ModelHuggingFace

    instance = ModelHuggingFace()
    return instance


@pytest.fixture
def huggingface_dataset_instance():
    """Return a new ``HuggingFace`` object from the dataset initializer package.

    The class is imported inside the fixture, so the import only happens when
    a test actually requests this fixture.
    """
    from pkg.initializer_v2.dataset.huggingface import (
        HuggingFace as DatasetHuggingFace,
    )

    instance = DatasetHuggingFace()
    return instance


@pytest.fixture
def real_hf_token():
    """Return the value of the ``HUGGINGFACE_TOKEN`` environment variable.

    Intended for E2E tests that need a real HuggingFace token. The result is
    ``None`` when the variable is not set.
    """
    # NOTE(review): no pytest.skip is performed here when the token is
    # missing, so consumers of this fixture must cope with a None value.
    return os.environ.get("HUGGINGFACE_TOKEN")
Empty file.
107 changes: 107 additions & 0 deletions pkg/initializer_v2/test/e2e/test_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
import os
import runpy
import shutil
import tempfile

import pytest

import pkg.initializer_v2.utils.utils as utils
from sdk.python.kubeflow.storage_initializer.constants import VOLUME_PATH_DATASET


class TestDatasetE2E:
    """E2E tests for dataset initialization.

    Each test runs ``pkg.initializer_v2.dataset.__main__`` as a script (via
    ``runpy``) with a per-test temporary directory as the download target,
    then checks which files ended up in that directory.
    """

    @pytest.fixture(autouse=True)
    def setup_teardown(self, monkeypatch):
        """Setup and teardown for each test.

        Creates a temporary download directory next to this file, points the
        dataset volume path at it, and on teardown removes the directory and
        restores the process environment.
        """
        # Snapshot the environment BEFORE modifying it, so teardown restores
        # the truly original state instead of one that already contains the
        # variable set below.
        self.original_env = dict(os.environ)

        # Create temporary directory for dataset downloads
        current_dir = os.path.dirname(os.path.abspath(__file__))
        self.temp_dir = tempfile.mkdtemp(dir=current_dir)
        # NOTE(review): this uses the *value* of VOLUME_PATH_DATASET as the
        # environment variable name - confirm that is what is intended.
        os.environ[VOLUME_PATH_DATASET] = self.temp_dir

        # Monkeypatch the constant in the module
        import sdk.python.kubeflow.storage_initializer.constants as constants

        monkeypatch.setattr(constants, "VOLUME_PATH_DATASET", self.temp_dir)

        yield

        # Cleanup
        shutil.rmtree(self.temp_dir, ignore_errors=True)
        os.environ.clear()
        os.environ.update(self.original_env)

    def verify_dataset_files(self, expected_files):
        """Assert that every name in ``expected_files`` exists in the temp dir.

        Only top-level directory entries are considered. Nothing is checked
        when ``expected_files`` is empty or ``None``.
        """
        if expected_files:
            actual_files = set(os.listdir(self.temp_dir))
            missing_files = set(expected_files) - actual_files
            assert not missing_files, f"Missing expected files: {missing_files}"

    @pytest.mark.parametrize(
        "test_name, provider, test_case",
        [
            # Public HuggingFace dataset test
            (
                "HuggingFace - Public dataset",
                "huggingface",
                {
                    "storage_uri": "hf://karpathy/tiny_shakespeare",
                    "access_token": None,
                    "expected_files": ["tiny_shakespeare.py"],
                    "expected_error": None,
                },
            ),
            # Private HuggingFace dataset test
            # (
            #     "HuggingFace - Private dataset",
            #     "huggingface",
            #     {
            #         "storage_uri": "hf://username/private-dataset",
            #         "use_real_token": True,
            #         "expected_files": ["config.json", "dataset.safetensors"],
            #         "expected_error": None
            #     }
            # ),
            # Invalid HuggingFace dataset test
            (
                "HuggingFace - Invalid dataset",
                "huggingface",
                {
                    "storage_uri": "hf://invalid/nonexistent-dataset",
                    "access_token": None,
                    "expected_files": None,
                    "expected_error": Exception,
                },
            ),
        ],
    )
    def test_dataset_download(self, test_name, provider, test_case, real_hf_token):
        """Test end-to-end dataset download for different providers.

        ``test_case`` keys read here: ``storage_uri`` (required),
        ``expected_error`` (required; exception type or ``None``),
        ``expected_files``, ``access_token`` and ``use_real_token``
        (all optional).
        """
        print(f"\nRunning E2E test for {provider}: {test_name}")

        # Setup environment variables based on test case
        os.environ[utils.STORAGE_URI_ENV] = test_case["storage_uri"]
        expected_files = test_case.get("expected_files")

        # Handle token/credentials
        if test_case.get("use_real_token"):
            # os.environ only accepts strings; a missing token (None) would
            # raise TypeError here, so skip the case instead of erroring.
            if not real_hf_token:
                pytest.skip("HUGGINGFACE_TOKEN environment variable not set")
            os.environ["ACCESS_TOKEN"] = real_hf_token
        elif test_case.get("access_token"):
            os.environ["ACCESS_TOKEN"] = test_case["access_token"]

        # Run the main script
        if test_case["expected_error"]:
            with pytest.raises(test_case["expected_error"]):
                runpy.run_module(
                    "pkg.initializer_v2.dataset.__main__", run_name="__main__"
                )
        else:
            runpy.run_module(
                "pkg.initializer_v2.dataset.__main__", run_name="__main__"
            )
            self.verify_dataset_files(expected_files)

        print("Test execution completed")
113 changes: 113 additions & 0 deletions pkg/initializer_v2/test/e2e/test_model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
import os
import runpy
import shutil
import tempfile

import pytest

import pkg.initializer_v2.utils.utils as utils
from sdk.python.kubeflow.storage_initializer.constants import VOLUME_PATH_MODEL


class TestModelE2E:
    """E2E tests for model initialization.

    Each test runs ``pkg.initializer_v2.model.__main__`` as a script (via
    ``runpy``) with a per-test temporary directory as the download target,
    then checks which files ended up in that directory.
    """

    @pytest.fixture(autouse=True)
    def setup_teardown(self, monkeypatch):
        """Setup and teardown for each test.

        Creates a temporary download directory next to this file, points the
        model volume path at it, and on teardown removes the directory and
        restores the process environment.
        """
        # Snapshot the environment BEFORE modifying it, so teardown restores
        # the truly original state instead of one that already contains the
        # variable set below.
        self.original_env = dict(os.environ)

        # Create temporary directory for model downloads
        current_dir = os.path.dirname(os.path.abspath(__file__))
        self.temp_dir = tempfile.mkdtemp(dir=current_dir)
        # NOTE(review): this uses the *value* of VOLUME_PATH_MODEL as the
        # environment variable name - confirm that is what is intended.
        os.environ[VOLUME_PATH_MODEL] = self.temp_dir

        # Monkeypatch the constant in the module
        import sdk.python.kubeflow.storage_initializer.constants as constants

        monkeypatch.setattr(constants, "VOLUME_PATH_MODEL", self.temp_dir)

        yield

        # Cleanup
        shutil.rmtree(self.temp_dir, ignore_errors=True)
        os.environ.clear()
        os.environ.update(self.original_env)

    def verify_model_files(self, expected_files):
        """Assert that every name in ``expected_files`` exists in the temp dir.

        Only top-level directory entries are considered. Nothing is checked
        when ``expected_files`` is empty or ``None``.
        """
        if expected_files:
            actual_files = set(os.listdir(self.temp_dir))
            missing_files = set(expected_files) - actual_files
            assert not missing_files, f"Missing expected files: {missing_files}"

    @pytest.mark.parametrize(
        "test_name, provider, test_case",
        [
            # Public HuggingFace model test
            (
                "HuggingFace - Public model",
                "huggingface",
                {
                    "storage_uri": "hf://hf-internal-testing/tiny-random-bert",
                    "access_token": None,
                    "expected_files": [
                        "config.json",
                        "model.safetensors",
                        "tokenizer.json",
                        "tokenizer_config.json",
                    ],
                    "expected_error": None,
                },
            ),
            # Private HuggingFace model test
            # (
            #     "HuggingFace - Private model",
            #     "huggingface",
            #     {
            #         "storage_uri": "hf://username/private-model",
            #         "use_real_token": True,
            #         "expected_files": ["config.json", "model.safetensors"],
            #         "expected_error": None
            #     }
            # ),
            # Invalid HuggingFace model test
            (
                "HuggingFace - Invalid model",
                "huggingface",
                {
                    "storage_uri": "hf://invalid/nonexistent-model",
                    "access_token": None,
                    "expected_files": None,
                    "expected_error": Exception,
                },
            ),
        ],
    )
    def test_model_download(self, test_name, provider, test_case, real_hf_token):
        """Test end-to-end model download for different providers.

        ``test_case`` keys read here: ``storage_uri`` (required),
        ``expected_error`` (required; exception type or ``None``),
        ``expected_files``, ``access_token`` and ``use_real_token``
        (all optional).
        """
        print(f"\nRunning E2E test for {provider}: {test_name}")

        # Setup environment variables based on test case
        os.environ[utils.STORAGE_URI_ENV] = test_case["storage_uri"]
        expected_files = test_case.get("expected_files")

        # Handle token/credentials
        if test_case.get("use_real_token"):
            # os.environ only accepts strings; a missing token (None) would
            # raise TypeError here, so skip the case instead of erroring.
            if not real_hf_token:
                pytest.skip("HUGGINGFACE_TOKEN environment variable not set")
            os.environ["ACCESS_TOKEN"] = real_hf_token
        elif test_case.get("access_token"):
            os.environ["ACCESS_TOKEN"] = test_case["access_token"]

        # Run the main script
        if test_case["expected_error"]:
            with pytest.raises(test_case["expected_error"]):
                runpy.run_module(
                    "pkg.initializer_v2.model.__main__", run_name="__main__"
                )
        else:
            runpy.run_module(
                "pkg.initializer_v2.model.__main__", run_name="__main__"
            )
            self.verify_model_files(expected_files)

        print("Test execution completed")
Empty file.
Empty file.
86 changes: 86 additions & 0 deletions pkg/initializer_v2/test/unit/dataset/test_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
import runpy
from unittest.mock import MagicMock, patch

import pytest


@pytest.mark.parametrize(
    "test_name, test_case",
    [
        (
            "Successful download with HuggingFace provider",
            {
                "storage_uri": "hf://dataset/path",
                "access_token": "test_token",
                "mock_config_error": False,
                "expected_error": None,
            },
        ),
        (
            "Missing storage URI environment variable",
            {
                "storage_uri": None,
                "access_token": None,
                "mock_config_error": False,
                "expected_error": Exception,
            },
        ),
        (
            "Invalid storage URI scheme",
            {
                "storage_uri": "invalid://dataset/path",
                "access_token": None,
                "mock_config_error": False,
                "expected_error": Exception,
            },
        ),
        (
            "Config loading failure",
            {
                "storage_uri": "hf://dataset/path",
                "access_token": None,
                "mock_config_error": True,
                "expected_error": Exception,
            },
        ),
    ],
)
def test_dataset_main(test_name, test_case, mock_env_vars):
    """Run the dataset entry point as a script under several scenarios.

    The dataset ``HuggingFace`` class is patched with a mock, the
    ``STORAGE_URI`` / ``ACCESS_TOKEN`` variables are set (or removed when the
    case gives ``None``), and the module is executed with ``runpy``. A case
    either expects an exception or expects ``load_config`` and
    ``download_dataset`` to each be called once on the mock instance.
    """
    print(f"Running test: {test_name}")

    storage_uri = test_case["storage_uri"]
    expected_error = test_case["expected_error"]

    # Environment for this scenario; None values unset the variable.
    mock_env_vars(STORAGE_URI=storage_uri, ACCESS_TOKEN=test_case["access_token"])

    # Stand-in for the provider object, optionally failing on config load.
    hf_instance = MagicMock()
    if test_case["mock_config_error"]:
        hf_instance.load_config.side_effect = Exception

    with patch(
        "pkg.initializer_v2.dataset.huggingface.HuggingFace",
        return_value=hf_instance,
    ) as hf_class:
        if not expected_error:
            runpy.run_module(
                "pkg.initializer_v2.dataset.__main__", run_name="__main__"
            )

            # Success path: both provider methods ran exactly once.
            hf_instance.load_config.assert_called_once()
            hf_instance.download_dataset.assert_called_once()
        else:
            with pytest.raises(expected_error):
                runpy.run_module(
                    "pkg.initializer_v2.dataset.__main__", run_name="__main__"
                )

        # For hf:// URIs the provider class must have been instantiated once,
        # regardless of whether the run succeeded.
        if storage_uri and storage_uri.startswith("hf://"):
            hf_class.assert_called_once()

    print("Test execution completed")
16 changes: 16 additions & 0 deletions pkg/initializer_v2/test/unit/dataset/test_dataset_config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
from pkg.initializer_v2.dataset.config import HuggingFaceDatasetConfig


def test_huggingface_dataset_config_creation():
    """Check HuggingFaceDatasetConfig creation with and without a token."""
    uri = "hf://dataset/path"

    # Only the storage URI is given: the access token should come back None.
    minimal = HuggingFaceDatasetConfig(storage_uri=uri)
    assert minimal.storage_uri == "hf://dataset/path"
    assert minimal.access_token is None

    # Both fields given explicitly: each should be stored unchanged.
    complete = HuggingFaceDatasetConfig(storage_uri=uri, access_token="dummy_token")
    assert complete.storage_uri == "hf://dataset/path"
    assert complete.access_token == "dummy_token"
Loading

0 comments on commit f4167e5

Please sign in to comment.