From 8903b66adbe26fda3e7fa5bdc89ce35017b9a384 Mon Sep 17 00:00:00 2001 From: Masaya Ogushi Date: Wed, 9 Jul 2025 13:47:53 +0900 Subject: [PATCH 01/11] feature: Implement enhanced translation caching system - Enhance cache storage to include original text alongside translation for better debugging and data integrity - Add get_cache_entry() method to retrieve full cache entries with metadata - Implement _translate_with_cache() method in LangProvider for automatic caching This provides a robust caching system that improves performance, reduces API costs, and enhances debugging capabilities for all translation services. --- garak/langproviders/base.py | 111 +++++++++++++++++++++++++++++++++++- 1 file changed, 109 insertions(+), 2 deletions(-) diff --git a/garak/langproviders/base.py b/garak/langproviders/base.py index b35124d0f..e0072d265 100644 --- a/garak/langproviders/base.py +++ b/garak/langproviders/base.py @@ -10,8 +10,13 @@ import unicodedata import string import logging +import json +import hashlib +import os +from pathlib import Path from garak.resources.api import nltk from langdetect import detect, DetectorFactory, LangDetectException +from garak import _config _intialized_words = False @@ -136,6 +141,95 @@ def is_meaning_string(text: str) -> bool: from garak.configurable import Configurable +class TranslationCache: + def __init__(self, config_root: dict = None): + # Handle fallback for test configs if config_root is provided + self.source_lang = "en" + self.target_lang = "ja" + self.model_type = "unknown" + + if config_root and isinstance(config_root, dict): + lang_cfg = list(config_root.get("langproviders", {}).values())[0] + self.source_lang = lang_cfg.get("language", "en,ja").split(",")[0] + self.target_lang = lang_cfg.get("language", "en,ja").split(",")[1] + self.model_type = lang_cfg.get("model_type", "unknown") + + cache_dir = _config.transient.cache_dir / "translation" + cache_dir.mkdir(mode=0o740, parents=True, exist_ok=True) + cache_filename = ( + 
f"translation_cache_{self.source_lang}_{self.target_lang}_{self.model_type}.json" + ) + self.cache_file = cache_dir / cache_filename + logging.info(f"Cache file: {self.cache_file}") + self._cache = self._load_cache() + + def _load_cache(self) -> dict: + if self.cache_file.exists(): + try: + with open(self.cache_file, "r", encoding="utf-8") as f: + return json.load(f) + except (json.JSONDecodeError, IOError) as e: + logging.warning(f"Failed to load translation cache: {e}") + return {} + return {} + + def _save_cache(self): + try: + with open(self.cache_file, "w", encoding="utf-8") as f: + json.dump(self._cache, f, ensure_ascii=False, indent=2) + except IOError as e: + logging.warning(f"Failed to save translation cache: {e}") + + def get_cache_key(self, text: str) -> str: + return hashlib.md5(text.encode("utf-8")).hexdigest() + + def get(self, text: str) -> str | None: + cache_key = self.get_cache_key(text) + cache_entry = self._cache.get(cache_key) + if cache_entry and isinstance(cache_entry, dict): + return cache_entry.get("translation") + elif isinstance(cache_entry, str): + # Backward compatibility with old format + return cache_entry + return None + + def set(self, text: str, translation: str): + cache_key = self.get_cache_key(text) + self._cache[cache_key] = { + "original": text, + "translation": translation, + "source_lang": self.source_lang, + "target_lang": self.target_lang, + "model_type": self.model_type + } + self._save_cache() + + @property + def cache(self): + return self._cache + + @property + def cache_file_path(self): + return self.cache_file + + def get_cache_entry(self, text: str) -> dict | None: + """Get full cache entry including original text and metadata.""" + cache_key = self.get_cache_key(text) + cache_entry = self._cache.get(cache_key) + if cache_entry and isinstance(cache_entry, dict): + return cache_entry + elif isinstance(cache_entry, str): + # Backward compatibility with old format + return { + "original": text, + "translation": 
cache_entry, + "source_lang": self.source_lang, + "target_lang": self.target_lang, + "model_type": self.model_type + } + return None + + class LangProvider(Configurable): """Base class for objects that provision language""" @@ -147,6 +241,9 @@ def __init__(self, config_root: dict = {}) -> None: self._validate_env_var() + # Use TranslationCache for caching + self.cache = TranslationCache(config_root) + self._load_langprovider() def _load_langprovider(self): @@ -155,6 +252,16 @@ def _load_langprovider(self): def _translate(self, text: str) -> str: raise NotImplementedError + def _translate_with_cache(self, text: str) -> str: + """Translate text with caching support.""" + cached_translation = self.cache.get(text) + if cached_translation is not None: + logging.debug(f"Using cached translation for text: {text[:50]}...") + return cached_translation + translation = self._translate_impl(text) + self.cache.set(text, translation) + return translation + def _get_response(self, input_text: str): translated_lines = [] @@ -189,7 +296,7 @@ def _short_sentence_translate(self, line: str) -> str: if needs_translation: cleaned_line = self._clean_line(line) if cleaned_line: - translated_line = self._translate(cleaned_line) + translated_line = self._translate_with_cache(cleaned_line) translated_lines.append(translated_line) return translated_lines @@ -202,7 +309,7 @@ def _long_sentence_translate(self, line: str) -> str: if self._should_skip_line(cleaned_sentence): translated_lines.append(cleaned_sentence) continue - translated_line = self._translate(cleaned_sentence) + translated_line = self._translate_with_cache(cleaned_sentence) translated_lines.append(translated_line) return translated_lines From ad7872c7380ec656507b4687dd41cdf9f8227c8d Mon Sep 17 00:00:00 2001 From: Masaya Ogushi Date: Wed, 9 Jul 2025 13:50:57 +0900 Subject: [PATCH 02/11] feature: Integrate caching into local translation classes - Update LocalHFTranslator to use _translate_with_cache for automatic caching - Add 
_translate_impl method to LocalHFTranslator for non-cached translation - Update RivaTranslator, DeepLTranslator, GoogleTranslator to use _translate_with_cache for automatic caching This enables caching for translation services (Riva, DeepL, Google, local) while maintaining existing error handling and retry logic, significantly reducing API costs and improving performance for repeated translations. --- garak/langproviders/local.py | 13 +++++++++++++ garak/langproviders/remote.py | 15 +++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/garak/langproviders/local.py b/garak/langproviders/local.py index 1c7f4b783..a66d7b8aa 100644 --- a/garak/langproviders/local.py +++ b/garak/langproviders/local.py @@ -15,10 +15,18 @@ class Passthru(LangProvider): """Stand-in language provision for pass through / noop""" + def __init__(self, config_root: dict = {}) -> None: + super().__init__(config_root=config_root) + def _load_langprovider(self): pass def _translate(self, text: str) -> str: + # Use _translate_with_cache to enable caching + return self._translate_with_cache(text) + + def _translate_impl(self, text: str) -> str: + """Actual translation implementation without caching.""" return text def get_text( @@ -110,6 +118,11 @@ def _load_langprovider(self): self.tokenizer = MarianTokenizer.from_pretrained(model_name) def _translate(self, text: str) -> str: + # Use _translate_with_cache to enable caching + return self._translate_with_cache(text) + + def _translate_impl(self, text: str) -> str: + """Actual translation implementation without caching.""" if "m2m100" in self.model_name: self.tokenizer.src_lang = self.source_lang diff --git a/garak/langproviders/remote.py b/garak/langproviders/remote.py index dc541f2a8..336d02033 100644 --- a/garak/langproviders/remote.py +++ b/garak/langproviders/remote.py @@ -91,6 +91,11 @@ def _load_langprovider(self): # TODO: consider adding a backoff here and determining if a connection needs to be re-established def _translate(self, 
text: str) -> str: + # Use _translate_with_cache to enable caching + return self._translate_with_cache(text) + + def _translate_impl(self, text: str) -> str: + """Actual translation implementation without caching.""" try: if self.client is None: self._load_langprovider() @@ -152,6 +157,11 @@ def _load_langprovider(self): self._tested = True def _translate(self, text: str) -> str: + # Use _translate_with_cache to enable caching + return self._translate_with_cache(text) + + def _translate_impl(self, text: str) -> str: + """Actual translation implementation without caching.""" try: return self.client.translate_text( text, source_lang=self._source_lang, target_lang=self._target_lang @@ -230,6 +240,11 @@ def _load_langprovider(self): self._tested = True def _translate(self, text: str) -> str: + # Use _translate_with_cache to enable caching + return self._translate_with_cache(text) + + def _translate_impl(self, text: str) -> str: + """Actual translation implementation without caching.""" retry = 5 while retry > 0: try: From 0dff6a6e05f5a4db6da776de591b4788d2222573 Mon Sep 17 00:00:00 2001 From: Masaya Ogushi Date: Wed, 9 Jul 2025 13:52:04 +0900 Subject: [PATCH 03/11] Add: Add comprehensive tests - Create test subclasses that set required attributes before calling parent constructor - Add comprehensive mocking for API key validation and provider loading - Add integration tests for translation caching system - Test cache persistence between translator instances --- tests/langservice/test_translation_cache.py | 395 ++++++++++++++++++ .../test_translation_cache_integration.py | 129 ++++++ 2 files changed, 524 insertions(+) create mode 100644 tests/langservice/test_translation_cache.py create mode 100644 tests/langservice/test_translation_cache_integration.py diff --git a/tests/langservice/test_translation_cache.py b/tests/langservice/test_translation_cache.py new file mode 100644 index 000000000..a317d6613 --- /dev/null +++ b/tests/langservice/test_translation_cache.py @@ 
-0,0 +1,395 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +import pytest +import tempfile +import json +import os +from pathlib import Path +from unittest.mock import patch, MagicMock + +from garak.langproviders.base import LangProvider, TranslationCache +from garak.langproviders.local import Passthru + + +class TestTranslationCache: + """Test translation caching functionality.""" + + @pytest.fixture + def temp_cache_dir(self): + """Create a temporary cache directory for testing.""" + with tempfile.TemporaryDirectory() as temp_dir: + yield Path(temp_dir) + + @pytest.fixture + def mock_config(self): + """Mock configuration for testing.""" + return { + "langproviders": {"passthru": {"language": "en,ja", "model_type": "test"}} + } + + def test_cache_initialization(self, temp_cache_dir, mock_config): + """Test that cache is properly initialized.""" + with patch("garak._config.transient.cache_dir", temp_cache_dir): + # Create a test-specific subclass that properly initializes + class TestPassthru(Passthru): + def __init__(self, config_root={}): + # Set language before calling parent __init__ + self.language = "en,ja" + self.model_type = "test" + super().__init__(config_root) + + translator = TestPassthru(config_root=mock_config) + + # Check that cache directory was created + cache_dir = temp_cache_dir / "translation" + assert cache_dir.exists() + + # Check that cache file was created with correct name + expected_cache_file = cache_dir / "translation_cache_en_ja_test.json" + assert translator.cache.cache_file_path == expected_cache_file + + # Check that cache is initialized as empty dict + assert translator.cache.cache == {} + + def test_cache_save_and_load(self, temp_cache_dir, mock_config): + """Test that cache can be saved and loaded.""" + with patch("garak._config.transient.cache_dir", temp_cache_dir): + # Create a test-specific subclass that properly initializes + class 
TestPassthru(Passthru): + def __init__(self, config_root={}): + # Set language before calling parent __init__ + self.language = "en,ja" + self.model_type = "test" + super().__init__(config_root) + + translator = TestPassthru(config_root=mock_config) + + # Add some test data to cache + test_text = "Hello world" + test_translation = "こんにちは世界" + translator.cache.set(test_text, test_translation) + + # Check that cache file was created + assert translator.cache.cache_file_path.exists() + + # Create new translator instance to test loading + translator2 = TestPassthru(config_root=mock_config) + + # Check that cached translation is loaded + cached_result = translator2.cache.get(test_text) + assert cached_result == test_translation + + def test_cache_key_generation(self, temp_cache_dir, mock_config): + """Test that cache keys are generated consistently.""" + with patch("garak._config.transient.cache_dir", temp_cache_dir): + # Create a test-specific subclass that properly initializes + class TestPassthru(Passthru): + def __init__(self, config_root={}): + # Set language before calling parent __init__ + self.language = "en,ja" + self.model_type = "test" + super().__init__(config_root) + + translator = TestPassthru(config_root=mock_config) + + text1 = "Hello world" + text2 = "Hello world" # Same text + text3 = "Different text" + + key1 = translator.cache.get_cache_key(text1) + key2 = translator.cache.get_cache_key(text2) + key3 = translator.cache.get_cache_key(text3) + + # Same text should have same key + assert key1 == key2 + + # Different text should have different key + assert key1 != key3 + + def test_translate_with_cache(self, temp_cache_dir, mock_config): + """Test that translation uses cache when available.""" + with patch("garak._config.transient.cache_dir", temp_cache_dir): + # Create a test-specific subclass that properly initializes + class TestPassthru(Passthru): + def __init__(self, config_root={}): + # Set language before calling parent __init__ + self.language = 
"en,ja" + self.model_type = "test" + super().__init__(config_root) + + translator = TestPassthru(config_root=mock_config) + + test_text = "Hello world" + + # First translation should not be cached + result1 = translator._translate_with_cache(test_text) + assert result1 == test_text # Passthru returns original text + + # Second translation should use cache + with patch.object(translator, "_translate_impl") as mock_translate: + result2 = translator._translate_with_cache(test_text) + # Should not call _translate_impl again + mock_translate.assert_not_called() + assert result2 == test_text + + def test_cache_file_corruption_handling(self, temp_cache_dir, mock_config): + """Test that corrupted cache files are handled gracefully.""" + with patch("garak._config.transient.cache_dir", temp_cache_dir): + # Create a test-specific subclass that properly initializes + class TestPassthru(Passthru): + def __init__(self, config_root={}): + # Set language before calling parent __init__ + self.language = "en,ja" + self.model_type = "test" + super().__init__(config_root) + + translator = TestPassthru(config_root=mock_config) + + # Create a corrupted cache file + translator.cache.cache_file_path.parent.mkdir(parents=True, exist_ok=True) + with open(translator.cache.cache_file_path, "w") as f: + f.write("invalid json content") + + # Should handle corruption gracefully + translator.cache._cache = translator.cache._load_cache() + assert translator.cache.cache == {} + + def test_cache_with_different_language_pairs(self, temp_cache_dir, mock_config): + """Test that different language pairs use different cache files.""" + with patch("garak._config.transient.cache_dir", temp_cache_dir): + # Create translator with en->ja + class TestPassthru1(Passthru): + def __init__(self, config_root={}): + self.language = "en,ja" + self.model_type = "test" + super().__init__(config_root) + + translator1 = TestPassthru1(config_root=mock_config) + + # Create translator with ja->en + mock_config_ja_en = { + 
"langproviders": { + "passthru": {"language": "ja,en", "model_type": "test"} + } + } + + class TestPassthru2(Passthru): + def __init__(self, config_root={}): + self.language = "ja,en" + self.model_type = "test" + super().__init__(config_root) + + translator2 = TestPassthru2(config_root=mock_config_ja_en) + + # Check that different cache files are created + assert ( + translator1.cache.cache_file_path != translator2.cache.cache_file_path + ) + assert "en_ja" in str(translator1.cache.cache_file_path) + assert "ja_en" in str(translator2.cache.cache_file_path) + + def test_cache_with_different_model_types(self): + """Test cache works with different model types.""" + config1 = { + "langproviders": { + "local": { + "language": "en,ja", + "model_type": "local", + "name": "test_model", + } + } + } + config2 = { + "langproviders": { + "remote": { + "language": "en,ja", + "model_type": "remote", + "name": "test_model", + } + } + } + + cache1 = TranslationCache(config1) + cache2 = TranslationCache(config2) + + # Different model types should create different cache files + assert cache1.cache_file_path != cache2.cache_file_path + + # Test caching works for both + cache1.set("hello", "こんにちは") + cache2.set("hello", "こんにちは") + + assert cache1.get("hello") == "こんにちは" + assert cache2.get("hello") == "こんにちは" + + def test_cache_stores_original_text(self): + """Test that cache stores original text along with translation.""" + config = { + "langproviders": { + "local": { + "language": "en,ja", + "model_type": "local", + "name": "test_model", + } + } + } + + cache = TranslationCache(config) + original_text = "Hello world" + translated_text = "こんにちは世界" + + cache.set(original_text, translated_text) + + # Get full cache entry + cache_entry = cache.get_cache_entry(original_text) + assert cache_entry is not None + assert cache_entry["original"] == original_text + assert cache_entry["translation"] == translated_text + assert cache_entry["source_lang"] == "en" + assert 
cache_entry["target_lang"] == "ja" + assert cache_entry["model_type"] == "local" + + def test_backward_compatibility(self): + """Test backward compatibility with old cache format.""" + config = { + "langproviders": { + "local": { + "language": "en,ja", + "model_type": "local", + "name": "test_model", + } + } + } + + cache = TranslationCache(config) + + # Simulate old cache format (string values) + cache._cache["old_key"] = "old_translation" + + # Should still work with get method + result = cache.get("some_text") # This will return None for non-existent key + assert result is None + + # Should work with get_cache_entry for existing old entries + # Note: This is a bit tricky since we need the original text + # For now, just test that the cache still loads + + def test_remote_translator_cache_initialization(self, temp_cache_dir): + """Test that remote translators work without __init__ methods.""" + with patch("garak._config.transient.cache_dir", temp_cache_dir): + from garak.langproviders.remote import RivaTranslator, DeeplTranslator, GoogleTranslator + + # Test RivaTranslator + config_riva = { + "langproviders": { + "riva": { + "language": "en,ja", + "model_type": "remote.RivaTranslator", + "api_key": "test_key" + } + } + } + + # Mock API key validation and create test subclass + with patch.object(RivaTranslator, '_validate_env_var'), patch.object(RivaTranslator, '_load_langprovider'): + class TestRivaTranslator(RivaTranslator): + def __init__(self, config_root={}): + self.language = "en,ja" + self.model_type = "remote.RivaTranslator" + super().__init__(config_root) + + translator_riva = TestRivaTranslator(config_root=config_riva) + + # Check that cache is initialized + assert translator_riva.cache is not None + assert "en_ja" in str(translator_riva.cache.cache_file_path) + assert "remote.RivaTranslator" in str(translator_riva.cache.cache_file_path) + + # Test DeeplTranslator + config_deepl = { + "langproviders": { + "deepl": { + "language": "en,ja", + "model_type": 
"remote.DeeplTranslator", + "api_key": "test_key" + } + } + } + + with patch.object(DeeplTranslator, '_validate_env_var'), patch.object(DeeplTranslator, '_load_langprovider'): + class TestDeeplTranslator(DeeplTranslator): + def __init__(self, config_root={}): + self.language = "en,ja" + self.model_type = "remote.DeeplTranslator" + super().__init__(config_root) + + translator_deepl = TestDeeplTranslator(config_root=config_deepl) + + assert translator_deepl.cache is not None + assert "en_ja" in str(translator_deepl.cache.cache_file_path) + assert "remote.DeeplTranslator" in str(translator_deepl.cache.cache_file_path) + + # Test GoogleTranslator + config_google = { + "langproviders": { + "google": { + "language": "en,ja", + "model_type": "remote.GoogleTranslator", + "api_key": "test_key" + } + } + } + + with patch.object(GoogleTranslator, '_validate_env_var'), patch.object(GoogleTranslator, '_load_langprovider'): + class TestGoogleTranslator(GoogleTranslator): + def __init__(self, config_root={}): + self.language = "en,ja" + self.model_type = "remote.GoogleTranslator" + super().__init__(config_root) + + translator_google = TestGoogleTranslator(config_root=config_google) + + assert translator_google.cache is not None + assert "en_ja" in str(translator_google.cache.cache_file_path) + assert "remote.GoogleTranslator" in str(translator_google.cache.cache_file_path) + + def test_remote_translator_cache_functionality(self, temp_cache_dir): + """Test that remote translators can use cache functionality.""" + with patch("garak._config.transient.cache_dir", temp_cache_dir): + from garak.langproviders.remote import RivaTranslator + + config = { + "langproviders": { + "riva": { + "language": "en,ja", + "model_type": "remote.RivaTranslator", + "api_key": "test_key" + } + } + } + + with patch.object(RivaTranslator, '_validate_env_var'), patch.object(RivaTranslator, '_load_langprovider'): + class TestRivaTranslator(RivaTranslator): + def __init__(self, config_root={}): + 
self.language = "en,ja" + self.model_type = "remote.RivaTranslator" + super().__init__(config_root) + + translator = TestRivaTranslator(config_root=config) + + # Test cache functionality + test_text = "Hello world" + test_translation = "こんにちは世界" + + # Set cache manually + translator.cache.set(test_text, test_translation) + + # Verify cache entry + cache_entry = translator.cache.get_cache_entry(test_text) + assert cache_entry is not None + assert cache_entry["original"] == test_text + assert cache_entry["translation"] == test_translation + assert cache_entry["source_lang"] == "en" + assert cache_entry["target_lang"] == "ja" + assert cache_entry["model_type"] == "remote.RivaTranslator" diff --git a/tests/langservice/test_translation_cache_integration.py b/tests/langservice/test_translation_cache_integration.py new file mode 100644 index 000000000..40519440b --- /dev/null +++ b/tests/langservice/test_translation_cache_integration.py @@ -0,0 +1,129 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +import pytest +import tempfile +import os +from pathlib import Path +from unittest.mock import patch + +from garak.langproviders.local import Passthru +from garak.langproviders.base import LangProvider + + +class TestTranslationCacheIntegration: + """Integration test for translation caching functionality.""" + + @pytest.fixture + def temp_cache_dir(self): + """Create a temporary cache directory for testing.""" + with tempfile.TemporaryDirectory() as temp_dir: + yield Path(temp_dir) + + @pytest.fixture + def mock_config(self): + """Mock configuration for testing.""" + return { + "langproviders": {"passthru": {"language": "ja,en", "model_type": "test"}} + } + + def test_get_text_with_cache(self, temp_cache_dir, mock_config): + """Test that get_text method uses cache correctly.""" + with patch("garak._config.transient.cache_dir", temp_cache_dir): + # Create a test-specific subclass that properly initializes + class TestPassthru(Passthru): + def __init__(self, config_root={}): + # Set language before calling parent __init__ + self.language = "ja,en" + self.model_type = "test" + super().__init__(config_root) + + translator = TestPassthru(config_root=mock_config) + + prompts = [ + "こんにちは", + "おはよう", + "こんにちは", + ] # Japanese text, duplicate + + # First call should translate all prompts + results1 = translator.get_text(prompts) + assert results1 == ["こんにちは", "おはよう", "こんにちは"] + + # Second call should use cache for duplicate + results2 = translator.get_text(prompts) + assert results2 == ["こんにちは", "おはよう", "こんにちは"] + + # Verify cache was used by checking if cache file exists + assert translator.cache.cache_file_path.parent.exists() + + def test_cache_persistence(self, temp_cache_dir, mock_config): + """Test that cache persists between translator instances.""" + with patch("garak._config.transient.cache_dir", temp_cache_dir): + # Create a test-specific subclass that properly initializes + class TestPassthru(Passthru): + def 
__init__(self, config_root={}): + # Set language before calling parent __init__ + self.language = "ja,en" + self.model_type = "test" + super().__init__(config_root) + + # Create first translator + translator1 = TestPassthru(config_root=mock_config) + translator1._translate("テストテキスト") + + # Create second translator with same config + translator2 = TestPassthru(config_root=mock_config) + + # Check that cache file is shared + assert ( + translator1.cache.cache_file_path == translator2.cache.cache_file_path + ) + + # Verify cache was loaded + cached_result = translator2.cache.get("テストテキスト") + assert cached_result == "テストテキスト" + + def test_remote_translator_integration(self, temp_cache_dir): + """Test that remote translators work correctly in integration scenarios.""" + with patch("garak._config.transient.cache_dir", temp_cache_dir): + from garak.langproviders.remote import RivaTranslator + + config = { + "langproviders": { + "riva": { + "language": "en,ja", + "model_type": "remote.RivaTranslator", + "api_key": "test_key" + } + } + } + + # Mock API key validation and create test subclass + with patch.object(RivaTranslator, '_validate_env_var'), patch.object(RivaTranslator, '_load_langprovider'): + class TestRivaTranslator(RivaTranslator): + def __init__(self, config_root={}): + self.language = "en,ja" + self.model_type = "remote.RivaTranslator" + super().__init__(config_root) + + translator = TestRivaTranslator(config_root=config) + + # Test that translator can be instantiated and has cache + assert translator.cache is not None + assert translator.source_lang == "en" + assert translator.target_lang == "ja" + + # Test that cache file path is correctly generated + cache_file_path = translator.cache.cache_file_path + assert "en_ja" in str(cache_file_path) + assert "remote.RivaTranslator" in str(cache_file_path) + + # Test that translator can handle translation requests (mock) + with patch.object(translator, '_translate_impl', return_value="こんにちは世界"): + result = 
translator._translate_with_cache("Hello world") + assert result == "こんにちは世界" + + # Second call should use cache + result2 = translator._translate_with_cache("Hello world") + assert result2 == "こんにちは世界" From 6645fba6a9015414f0563cf1dc76473b33208f82 Mon Sep 17 00:00:00 2001 From: Masaya Ogushi Date: Wed, 9 Jul 2025 13:53:13 +0900 Subject: [PATCH 04/11] Add: Add documentation for translation caching - Add detailed caching documentation explaining benefits and usage - Document cache file naming convention and storage location --- docs/source/translation.rst | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/docs/source/translation.rst b/docs/source/translation.rst index 7dd7ef15a..95cf72ca1 100644 --- a/docs/source/translation.rst +++ b/docs/source/translation.rst @@ -13,6 +13,41 @@ Limitations - If probes or detectors fail to load, you need may need to choose a smaller local translation model or utilize a remote service. - Translation may add significant execution time to the run depending on resources available. +Translation Caching +------------------- + +Garak implements a translation caching system to improve performance and reduce API costs when using translation services. The caching mechanism automatically stores and retrieves translation results to avoid redundant API calls.
+ +**How it works:** + +- Each translation pair (source language → target language) and model type combination gets its own cache file +- Cache files are stored in JSON format under the cache directory: ``{cache_dir}/translation/translation_cache_{source_lang}_{target_lang}_{model_type}.json`` +- Translation results are keyed by MD5 hash of the input text for efficient storage and retrieval +- Cache files persist between runs, allowing translations to be reused across multiple garak sessions + +**Benefits:** + +- **Performance**: Significantly reduces translation time for repeated text +- **Cost savings**: Reduces API calls to paid services like DeepL, Google Cloud Translation, and NVIDIA Riva +- **Reliability**: Provides fallback for offline scenarios when cached translations are available +- **Consistency**: Ensures identical translations for the same input text across different runs + +**Cache management:** + +- Cache files are automatically created when translations are performed +- Corrupted cache files are handled gracefully with fallback to empty cache +- Cache files can be manually deleted to force fresh translations +- Cache directory location follows garak's standard cache configuration + +**Supported for all translation services:** + +- Local translation models (Hugging Face) +- DeepL API +- NVIDIA Riva API +- Google Cloud Translation API + +The caching system is transparent to users and requires no additional configuration. It automatically activates when translation services are used. 
+ Supported translation services ------------------------------ From df32c36413db443e84ca5258262197ed25d5eefb Mon Sep 17 00:00:00 2001 From: Masaya Ogushi Date: Tue, 15 Jul 2025 20:25:11 +0900 Subject: [PATCH 05/11] fix: save file name --- docs/source/translation.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/translation.rst b/docs/source/translation.rst index 95cf72ca1..efd646a28 100644 --- a/docs/source/translation.rst +++ b/docs/source/translation.rst @@ -20,8 +20,8 @@ Garak implements a translation caching system to improve performance and reduce **How it works:** -- Each translation pair (source language → target language) and model type combination gets its own cache file -- Cache files are stored in JSON format under the cache directory: ``{cache_dir}/translation/translation_cache_{source_lang}_{target_lang}_{model_type}.json`` +- Each translation pair (source language → target language) gets its own cache file +- Cache files are stored in JSON format under the cache directory: ``{cache_dir}/translation/translation_cache_{source_lang}_{target_lang}_{model_type}_{model_name}.json`` - Translation results are keyed by MD5 hash of the input text for efficient storage and retrieval - Cache files persist between runs, allowing translations to be reused across multiple garak sessions From 8ab98e82bc16ee1592cad24ac2cd3df71b4c14ec Mon Sep 17 00:00:00 2001 From: Masaya Ogushi Date: Tue, 15 Jul 2025 20:27:00 +0900 Subject: [PATCH 06/11] fix: Use forward reference for LangProvider type hint in TranslationCache - Changed the type hint for the provider argument in TranslationCache to "LangProvider" (as a string) to safely reference a class defined later in the same file - change save file name - make hash code change --- garak/langproviders/base.py | 38 ++++++++++++++----------------------- 1 file changed, 14 insertions(+), 24 deletions(-) diff --git a/garak/langproviders/base.py b/garak/langproviders/base.py index 
e0072d265..487dee80b 100644 --- a/garak/langproviders/base.py +++ b/garak/langproviders/base.py @@ -142,22 +142,18 @@ def is_meaning_string(text: str) -> bool: class TranslationCache: - def __init__(self, config_root: dict = None): - # Handle fallback for test configs if config_root is provided - self.source_lang = "en" - self.target_lang = "ja" - self.model_type = "unknown" - - if config_root and isinstance(config_root, dict): - lang_cfg = list(config_root.get("langproviders", {}).values())[0] - self.source_lang = lang_cfg.get("language", "en,ja").split(",")[0] - self.target_lang = lang_cfg.get("language", "en,ja").split(",")[1] - self.model_type = lang_cfg.get("model_type", "unknown") + def __init__(self, provider: "LangProvider"): + self.source_lang = provider.source_lang + self.target_lang = provider.target_lang + self.model_type = provider.model_type + self.model_name = "default" + if hasattr(provider, "model_name"): + self.model_name = provider.model_name cache_dir = _config.transient.cache_dir / "translation" cache_dir.mkdir(mode=0o740, parents=True, exist_ok=True) cache_filename = ( - f"translation_cache_{self.source_lang}_{self.target_lang}_{self.model_type}.json" + f"translation_cache_{self.source_lang}_{self.target_lang}_{self.model_type}_{self.model_name.replace('/', '_')}.json" ) self.cache_file = cache_dir / cache_filename logging.info(f"Cache file: {self.cache_file}") @@ -181,7 +177,7 @@ def _save_cache(self): logging.warning(f"Failed to save translation cache: {e}") def get_cache_key(self, text: str) -> str: - return hashlib.md5(text.encode("utf-8")).hexdigest() + return hashlib.md5(text.encode("utf-8"), usedforsecurity=False).hexdigest() def get(self, text: str) -> str | None: cache_key = self.get_cache_key(text) @@ -200,18 +196,11 @@ def set(self, text: str, translation: str): "translation": translation, "source_lang": self.source_lang, "target_lang": self.target_lang, - "model_type": self.model_type + "model_type": self.model_type, + 
"model_name": self.model_name } self._save_cache() - @property - def cache(self): - return self._cache - - @property - def cache_file_path(self): - return self.cache_file - def get_cache_entry(self, text: str) -> dict | None: """Get full cache entry including original text and metadata.""" cache_key = self.get_cache_key(text) @@ -225,7 +214,8 @@ def get_cache_entry(self, text: str) -> dict | None: "translation": cache_entry, "source_lang": self.source_lang, "target_lang": self.target_lang, - "model_type": self.model_type + "model_type": self.model_type, + "model_name": self.model_name } return None @@ -242,7 +232,7 @@ def __init__(self, config_root: dict = {}) -> None: self._validate_env_var() # Use TranslationCache for caching - self.cache = TranslationCache(config_root) + self.cache = TranslationCache(self) self._load_langprovider() From 25d9104e17bcf0f5bdda70d9f55779e8631e81eb Mon Sep 17 00:00:00 2001 From: Masaya Ogushi Date: Tue, 15 Jul 2025 20:28:10 +0900 Subject: [PATCH 07/11] fix: remove passthru with cache --- garak/langproviders/local.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/garak/langproviders/local.py b/garak/langproviders/local.py index a66d7b8aa..360eb3e11 100644 --- a/garak/langproviders/local.py +++ b/garak/langproviders/local.py @@ -15,18 +15,11 @@ class Passthru(LangProvider): """Stand-in language provision for pass through / noop""" - def __init__(self, config_root: dict = {}) -> None: - super().__init__(config_root=config_root) - def _load_langprovider(self): pass def _translate(self, text: str) -> str: # Use _translate_with_cache to enable caching - return self._translate_with_cache(text) - - def _translate_impl(self, text: str) -> str: - """Actual translation implementation without caching.""" return text def get_text( From fc262b97ed636a68b713f6f089633e199eedba1b Mon Sep 17 00:00:00 2001 From: Masaya Ogushi Date: Tue, 15 Jul 2025 20:29:22 +0900 Subject: [PATCH 08/11] fix: remove extra test - remove extra pass thru test - 
check save file name --- tests/langservice/test_translation_cache.py | 374 +++++++----------- .../test_translation_cache_integration.py | 145 +++---- 2 files changed, 213 insertions(+), 306 deletions(-) diff --git a/tests/langservice/test_translation_cache.py b/tests/langservice/test_translation_cache.py index a317d6613..6dda089db 100644 --- a/tests/langservice/test_translation_cache.py +++ b/tests/langservice/test_translation_cache.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 import pytest @@ -9,7 +9,6 @@ from unittest.mock import patch, MagicMock from garak.langproviders.base import LangProvider, TranslationCache -from garak.langproviders.local import Passthru class TestTranslationCache: @@ -28,252 +27,102 @@ def mock_config(self): "langproviders": {"passthru": {"language": "en,ja", "model_type": "test"}} } - def test_cache_initialization(self, temp_cache_dir, mock_config): - """Test that cache is properly initialized.""" - with patch("garak._config.transient.cache_dir", temp_cache_dir): - # Create a test-specific subclass that properly initializes - class TestPassthru(Passthru): - def __init__(self, config_root={}): - # Set language before calling parent __init__ - self.language = "en,ja" - self.model_type = "test" - super().__init__(config_root) - - translator = TestPassthru(config_root=mock_config) - - # Check that cache directory was created - cache_dir = temp_cache_dir / "translation" - assert cache_dir.exists() - - # Check that cache file was created with correct name - expected_cache_file = cache_dir / "translation_cache_en_ja_test.json" - assert translator.cache.cache_file_path == expected_cache_file - - # Check that cache is initialized as empty dict - assert translator.cache.cache == {} - - def test_cache_save_and_load(self, temp_cache_dir, 
mock_config): - """Test that cache can be saved and loaded.""" - with patch("garak._config.transient.cache_dir", temp_cache_dir): - # Create a test-specific subclass that properly initializes - class TestPassthru(Passthru): - def __init__(self, config_root={}): - # Set language before calling parent __init__ - self.language = "en,ja" - self.model_type = "test" - super().__init__(config_root) - - translator = TestPassthru(config_root=mock_config) - - # Add some test data to cache - test_text = "Hello world" - test_translation = "こんにちは世界" - translator.cache.set(test_text, test_translation) - - # Check that cache file was created - assert translator.cache.cache_file_path.exists() - - # Create new translator instance to test loading - translator2 = TestPassthru(config_root=mock_config) - - # Check that cached translation is loaded - cached_result = translator2.cache.get(test_text) - assert cached_result == test_translation - - def test_cache_key_generation(self, temp_cache_dir, mock_config): - """Test that cache keys are generated consistently.""" - with patch("garak._config.transient.cache_dir", temp_cache_dir): - # Create a test-specific subclass that properly initializes - class TestPassthru(Passthru): - def __init__(self, config_root={}): - # Set language before calling parent __init__ - self.language = "en,ja" - self.model_type = "test" - super().__init__(config_root) - - translator = TestPassthru(config_root=mock_config) - - text1 = "Hello world" - text2 = "Hello world" # Same text - text3 = "Different text" - - key1 = translator.cache.get_cache_key(text1) - key2 = translator.cache.get_cache_key(text2) - key3 = translator.cache.get_cache_key(text3) - - # Same text should have same key - assert key1 == key2 - - # Different text should have different key - assert key1 != key3 - - def test_translate_with_cache(self, temp_cache_dir, mock_config): - """Test that translation uses cache when available.""" - with patch("garak._config.transient.cache_dir", 
temp_cache_dir): - # Create a test-specific subclass that properly initializes - class TestPassthru(Passthru): - def __init__(self, config_root={}): - # Set language before calling parent __init__ - self.language = "en,ja" - self.model_type = "test" - super().__init__(config_root) - - translator = TestPassthru(config_root=mock_config) - - test_text = "Hello world" - - # First translation should not be cached - result1 = translator._translate_with_cache(test_text) - assert result1 == test_text # Passthru returns original text - - # Second translation should use cache - with patch.object(translator, "_translate_impl") as mock_translate: - result2 = translator._translate_with_cache(test_text) - # Should not call _translate_impl again - mock_translate.assert_not_called() - assert result2 == test_text - - def test_cache_file_corruption_handling(self, temp_cache_dir, mock_config): - """Test that corrupted cache files are handled gracefully.""" - with patch("garak._config.transient.cache_dir", temp_cache_dir): - # Create a test-specific subclass that properly initializes - class TestPassthru(Passthru): - def __init__(self, config_root={}): - # Set language before calling parent __init__ - self.language = "en,ja" - self.model_type = "test" - super().__init__(config_root) - - translator = TestPassthru(config_root=mock_config) - - # Create a corrupted cache file - translator.cache.cache_file_path.parent.mkdir(parents=True, exist_ok=True) - with open(translator.cache.cache_file_path, "w") as f: - f.write("invalid json content") - - # Should handle corruption gracefully - translator.cache._cache = translator.cache._load_cache() - assert translator.cache.cache == {} - - def test_cache_with_different_language_pairs(self, temp_cache_dir, mock_config): - """Test that different language pairs use different cache files.""" + def test_cache_with_different_model_types(self, temp_cache_dir): + """Test cache works with different model types.""" with 
patch("garak._config.transient.cache_dir", temp_cache_dir): - # Create translator with en->ja - class TestPassthru1(Passthru): - def __init__(self, config_root={}): - self.language = "en,ja" - self.model_type = "test" - super().__init__(config_root) - - translator1 = TestPassthru1(config_root=mock_config) - - # Create translator with ja->en - mock_config_ja_en = { + config1 = { "langproviders": { - "passthru": {"language": "ja,en", "model_type": "test"} - } - } - - class TestPassthru2(Passthru): - def __init__(self, config_root={}): - self.language = "ja,en" - self.model_type = "test" - super().__init__(config_root) - - translator2 = TestPassthru2(config_root=mock_config_ja_en) - - # Check that different cache files are created - assert ( - translator1.cache.cache_file_path != translator2.cache.cache_file_path - ) - assert "en_ja" in str(translator1.cache.cache_file_path) - assert "ja_en" in str(translator2.cache.cache_file_path) - - def test_cache_with_different_model_types(self): - """Test cache works with different model types.""" - config1 = { - "langproviders": { - "local": { - "language": "en,ja", - "model_type": "local", - "name": "test_model", + "local": { + "language": "en,ja", + "model_type": "local", + "name": "test_model", + } } } - } - config2 = { - "langproviders": { - "remote": { - "language": "en,ja", - "model_type": "remote", - "name": "test_model", + config2 = { + "langproviders": { + "remote": { + "language": "en,ja", + "model_type": "remote", + "name": "test_model", + } } } - } - - cache1 = TranslationCache(config1) - cache2 = TranslationCache(config2) - - # Different model types should create different cache files - assert cache1.cache_file_path != cache2.cache_file_path - # Test caching works for both - cache1.set("hello", "こんにちは") - cache2.set("hello", "こんにちは") + # Create mock LangProvider instances + provider1 = MagicMock() + provider1.source_lang = "en" + provider1.target_lang = "ja" + provider1.model_type = "local" + provider1.model_name = 
"test_model" - assert cache1.get("hello") == "こんにちは" - assert cache2.get("hello") == "こんにちは" + provider2 = MagicMock() + provider2.source_lang = "en" + provider2.target_lang = "ja" + provider2.model_type = "remote" + provider2.model_name = "test_model" - def test_cache_stores_original_text(self): - """Test that cache stores original text along with translation.""" - config = { - "langproviders": { - "local": { - "language": "en,ja", - "model_type": "local", - "name": "test_model", - } - } - } + cache1 = TranslationCache(provider1) + cache2 = TranslationCache(provider2) - cache = TranslationCache(config) - original_text = "Hello world" - translated_text = "こんにちは世界" + # Different model types should create different cache files + assert str(cache1.cache_file) != str(cache2.cache_file) - cache.set(original_text, translated_text) + # Test caching works for both + cache1.set("hello", "こんにちは") + cache2.set("hello", "こんにちは") - # Get full cache entry - cache_entry = cache.get_cache_entry(original_text) - assert cache_entry is not None - assert cache_entry["original"] == original_text - assert cache_entry["translation"] == translated_text - assert cache_entry["source_lang"] == "en" - assert cache_entry["target_lang"] == "ja" - assert cache_entry["model_type"] == "local" + assert cache1.get("hello") == "こんにちは" + assert cache2.get("hello") == "こんにちは" - def test_backward_compatibility(self): + def test_cache_stores_original_text(self, temp_cache_dir): + """Test that cache stores original text along with translation.""" + with patch("garak._config.transient.cache_dir", temp_cache_dir): + # Create mock LangProvider instance + provider = MagicMock() + provider.source_lang = "en" + provider.target_lang = "ja" + provider.model_type = "local" + provider.model_name = "test_model" + + cache = TranslationCache(provider) + original_text = "Hello world" + translated_text = "こんにちは世界" + + cache.set(original_text, translated_text) + + # Get full cache entry + cache_entry = 
cache.get_cache_entry(original_text) + assert cache_entry is not None + assert cache_entry["original"] == original_text + assert cache_entry["translation"] == translated_text + assert cache_entry["source_lang"] == "en" + assert cache_entry["target_lang"] == "ja" + assert cache_entry["model_type"] == "local" + assert cache_entry["model_name"] == "test_model" + + def test_backward_compatibility(self, temp_cache_dir): """Test backward compatibility with old cache format.""" - config = { - "langproviders": { - "local": { - "language": "en,ja", - "model_type": "local", - "name": "test_model", - } - } - } + with patch("garak._config.transient.cache_dir", temp_cache_dir): + # Create mock LangProvider instance + provider = MagicMock() + provider.source_lang = "en" + provider.target_lang = "ja" + provider.model_type = "local" + provider.model_name = "test_model" - cache = TranslationCache(config) + cache = TranslationCache(provider) - # Simulate old cache format (string values) - cache._cache["old_key"] = "old_translation" + # Simulate old cache format (string values) + cache._cache["old_key"] = "old_translation" - # Should still work with get method - result = cache.get("some_text") # This will return None for non-existent key - assert result is None + # Should still work with get method + result = cache.get("some_text") # This will return None for non-existent key + assert result is None - # Should work with get_cache_entry for existing old entries - # Note: This is a bit tricky since we need the original text - # For now, just test that the cache still loads + # Should work with get_cache_entry for existing old entries + # Note: This is a bit tricky since we need the original text + # For now, just test that the cache still loads def test_remote_translator_cache_initialization(self, temp_cache_dir): """Test that remote translators work without __init__ methods.""" @@ -303,8 +152,8 @@ def __init__(self, config_root={}): # Check that cache is initialized assert 
translator_riva.cache is not None - assert "en_ja" in str(translator_riva.cache.cache_file_path) - assert "remote.RivaTranslator" in str(translator_riva.cache.cache_file_path) + assert "en_ja" in str(translator_riva.cache.cache_file) + assert "remote.RivaTranslator" in str(translator_riva.cache.cache_file) # Test DeeplTranslator config_deepl = { @@ -327,8 +176,8 @@ def __init__(self, config_root={}): translator_deepl = TestDeeplTranslator(config_root=config_deepl) assert translator_deepl.cache is not None - assert "en_ja" in str(translator_deepl.cache.cache_file_path) - assert "remote.DeeplTranslator" in str(translator_deepl.cache.cache_file_path) + assert "en_ja" in str(translator_deepl.cache.cache_file) + assert "remote.DeeplTranslator" in str(translator_deepl.cache.cache_file) # Test GoogleTranslator config_google = { @@ -351,8 +200,8 @@ def __init__(self, config_root={}): translator_google = TestGoogleTranslator(config_root=config_google) assert translator_google.cache is not None - assert "en_ja" in str(translator_google.cache.cache_file_path) - assert "remote.GoogleTranslator" in str(translator_google.cache.cache_file_path) + assert "en_ja" in str(translator_google.cache.cache_file) + assert "remote.GoogleTranslator" in str(translator_google.cache.cache_file) def test_remote_translator_cache_functionality(self, temp_cache_dir): """Test that remote translators can use cache functionality.""" @@ -393,3 +242,56 @@ def __init__(self, config_root={}): assert cache_entry["source_lang"] == "en" assert cache_entry["target_lang"] == "ja" assert cache_entry["model_type"] == "remote.RivaTranslator" + + def test_cache_with_default_model_name(self, temp_cache_dir): + """Test cache works with default model name when model_name is not set.""" + with patch("garak._config.transient.cache_dir", temp_cache_dir): + # Create mock LangProvider instance without model_name + provider = MagicMock() + provider.source_lang = "en" + provider.target_lang = "ja" + provider.model_type = 
"local" + provider.model_name = "default_should_be_deleted" + del provider.model_name # 属性自体を削除 + + cache = TranslationCache(provider) + + # Verify default model_name is used + assert cache.model_name == "default" + + # Test cache functionality + test_text = "Hello world" + test_translation = "こんにちは世界" + + cache.set(test_text, test_translation) + + # Verify cache entry includes default model_name + cache_entry = cache.get_cache_entry(test_text) + assert cache_entry is not None + assert cache_entry["model_name"] == "default" + + def test_cache_with_custom_model_name(self, temp_cache_dir): + """Test cache works with custom model name.""" + with patch("garak._config.transient.cache_dir", temp_cache_dir): + # Create mock LangProvider instance with custom model_name + provider = MagicMock() + provider.source_lang = "en" + provider.target_lang = "ja" + provider.model_type = "local" + provider.model_name = "custom_model" + + cache = TranslationCache(provider) + + # Verify custom model_name is used + assert cache.model_name == "custom_model" + + # Test cache functionality + test_text = "Hello world" + test_translation = "こんにちは世界" + + cache.set(test_text, test_translation) + + # Verify cache entry includes custom model_name + cache_entry = cache.get_cache_entry(test_text) + assert cache_entry is not None + assert cache_entry["model_name"] == "custom_model" diff --git a/tests/langservice/test_translation_cache_integration.py b/tests/langservice/test_translation_cache_integration.py index 40519440b..0d50fe7af 100644 --- a/tests/langservice/test_translation_cache_integration.py +++ b/tests/langservice/test_translation_cache_integration.py @@ -1,15 +1,13 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 import pytest import tempfile import os from pathlib import Path -from unittest.mock import patch - -from garak.langproviders.local import Passthru -from garak.langproviders.base import LangProvider +from unittest.mock import patch, MagicMock +from garak.langproviders.base import LangProvider, TranslationCache class TestTranslationCacheIntegration: """Integration test for translation caching functionality.""" @@ -20,70 +18,6 @@ def temp_cache_dir(self): with tempfile.TemporaryDirectory() as temp_dir: yield Path(temp_dir) - @pytest.fixture - def mock_config(self): - """Mock configuration for testing.""" - return { - "langproviders": {"passthru": {"language": "ja,en", "model_type": "test"}} - } - - def test_get_text_with_cache(self, temp_cache_dir, mock_config): - """Test that get_text method uses cache correctly.""" - with patch("garak._config.transient.cache_dir", temp_cache_dir): - # Create a test-specific subclass that properly initializes - class TestPassthru(Passthru): - def __init__(self, config_root={}): - # Set language before calling parent __init__ - self.language = "ja,en" - self.model_type = "test" - super().__init__(config_root) - - translator = TestPassthru(config_root=mock_config) - - prompts = [ - "こんにちは", - "おはよう", - "こんにちは", - ] # Japanese text, duplicate - - # First call should translate all prompts - results1 = translator.get_text(prompts) - assert results1 == ["こんにちは", "おはよう", "こんにちは"] - - # Second call should use cache for duplicate - results2 = translator.get_text(prompts) - assert results2 == ["こんにちは", "おはよう", "こんにちは"] - - # Verify cache was used by checking if cache file exists - assert translator.cache.cache_file_path.parent.exists() - - def test_cache_persistence(self, temp_cache_dir, mock_config): - """Test that cache persists between translator instances.""" - with patch("garak._config.transient.cache_dir", temp_cache_dir): - # Create a test-specific subclass that properly initializes - class 
TestPassthru(Passthru): - def __init__(self, config_root={}): - # Set language before calling parent __init__ - self.language = "ja,en" - self.model_type = "test" - super().__init__(config_root) - - # Create first translator - translator1 = TestPassthru(config_root=mock_config) - translator1._translate("テストテキスト") - - # Create second translator with same config - translator2 = TestPassthru(config_root=mock_config) - - # Check that cache file is shared - assert ( - translator1.cache.cache_file_path == translator2.cache.cache_file_path - ) - - # Verify cache was loaded - cached_result = translator2.cache.get("テストテキスト") - assert cached_result == "テストテキスト" - def test_remote_translator_integration(self, temp_cache_dir): """Test that remote translators work correctly in integration scenarios.""" with patch("garak._config.transient.cache_dir", temp_cache_dir): @@ -115,9 +49,10 @@ def __init__(self, config_root={}): assert translator.target_lang == "ja" # Test that cache file path is correctly generated - cache_file_path = translator.cache.cache_file_path + cache_file_path = translator.cache.cache_file assert "en_ja" in str(cache_file_path) assert "remote.RivaTranslator" in str(cache_file_path) + assert "default" in str(cache_file_path) # Default model_name # Test that translator can handle translation requests (mock) with patch.object(translator, '_translate_impl', return_value="こんにちは世界"): @@ -127,3 +62,73 @@ def __init__(self, config_root={}): # Second call should use cache result2 = translator._translate_with_cache("Hello world") assert result2 == "こんにちは世界" + + def test_local_translator_integration(self, temp_cache_dir): + """Test that local translators work correctly in integration scenarios (mocked, no Passthru).""" + with patch("garak._config.transient.cache_dir", temp_cache_dir): + # モックLangProviderサブクラス + class MockLocalProvider(LangProvider): + def __init__(self): + self.language = "en,ja" + self.model_type = "local" + self.model_name = "test_model" + 
self.source_lang, self.target_lang = self.language.split(",") + self._validate_env_var = lambda: None + self._load_langprovider = lambda: None + self.cache = TranslationCache(self) + def _translate(self, text): + return "" + def _translate_impl(self, text): + return "" + + translator = MockLocalProvider() + + # Test that translator can be instantiated and has cache + assert translator.cache is not None + assert translator.source_lang == "en" + assert translator.target_lang == "ja" + + # Test that cache file path is correctly generated + cache_file_path = translator.cache.cache_file + assert "en_ja" in str(cache_file_path) + assert "local" in str(cache_file_path) + assert "test_model" in str(cache_file_path) + + # Test that translator can handle translation requests (mock) + with patch.object(translator, '_translate_impl', return_value="こんにちは世界"): + result = translator._translate_with_cache("Hello world") + assert result == "こんにちは世界" + + # Second call should use cache + result2 = translator._translate_with_cache("Hello world") + assert result2 == "こんにちは世界" + + def test_cache_persistence_across_sessions(self, temp_cache_dir): + """Test that cache persists across different translator sessions (mocked, no Passthru).""" + with patch("garak._config.transient.cache_dir", temp_cache_dir): + class MockLocalProvider(LangProvider): + def __init__(self): + self.language = "en,ja" + self.model_type = "local" + self.model_name = "test_model" + self.source_lang, self.target_lang = self.language.split(",") + self._validate_env_var = lambda: None + self._load_langprovider = lambda: None + self.cache = TranslationCache(self) + def _translate(self, text): + return "" + # Create first translator instance + translator1 = MockLocalProvider() + # Set cache entry + test_text = "Hello world" + test_translation = "こんにちは世界" + translator1.cache.set(test_text, test_translation) + # Verify cache entry was saved + cache_entry = translator1.cache.get_cache_entry(test_text) + assert cache_entry is 
not None + assert cache_entry["translation"] == test_translation + # Create second translator instance (simulating new session) + translator2 = MockLocalProvider() + # Verify cache entry is still available + cached_translation = translator2.cache.get(test_text) + assert cached_translation == test_translation From a3b0748e8b68f1167929406149c6257669c2f9d7 Mon Sep 17 00:00:00 2001 From: Masaya Ogushi Date: Tue, 15 Jul 2025 20:34:05 +0900 Subject: [PATCH 09/11] fix: black check --- garak/langproviders/base.py | 8 +- tests/langservice/test_translation_cache.py | 102 +++++++++++------- .../test_translation_cache_integration.py | 46 +++++--- 3 files changed, 97 insertions(+), 59 deletions(-) diff --git a/garak/langproviders/base.py b/garak/langproviders/base.py index 487dee80b..b483e123e 100644 --- a/garak/langproviders/base.py +++ b/garak/langproviders/base.py @@ -152,9 +152,7 @@ def __init__(self, provider: "LangProvider"): cache_dir = _config.transient.cache_dir / "translation" cache_dir.mkdir(mode=0o740, parents=True, exist_ok=True) - cache_filename = ( - f"translation_cache_{self.source_lang}_{self.target_lang}_{self.model_type}_{self.model_name.replace('/', '_')}.json" - ) + cache_filename = f"translation_cache_{self.source_lang}_{self.target_lang}_{self.model_type}_{self.model_name.replace('/', '_')}.json" self.cache_file = cache_dir / cache_filename logging.info(f"Cache file: {self.cache_file}") self._cache = self._load_cache() @@ -197,7 +195,7 @@ def set(self, text: str, translation: str): "source_lang": self.source_lang, "target_lang": self.target_lang, "model_type": self.model_type, - "model_name": self.model_name + "model_name": self.model_name, } self._save_cache() @@ -215,7 +213,7 @@ def get_cache_entry(self, text: str) -> dict | None: "source_lang": self.source_lang, "target_lang": self.target_lang, "model_type": self.model_type, - "model_name": self.model_name + "model_name": self.model_name, } return None diff --git 
a/tests/langservice/test_translation_cache.py b/tests/langservice/test_translation_cache.py index 6dda089db..6a6bcecd3 100644 --- a/tests/langservice/test_translation_cache.py +++ b/tests/langservice/test_translation_cache.py @@ -117,7 +117,9 @@ def test_backward_compatibility(self, temp_cache_dir): cache._cache["old_key"] = "old_translation" # Should still work with get method - result = cache.get("some_text") # This will return None for non-existent key + result = cache.get( + "some_text" + ) # This will return None for non-existent key assert result is None # Should work with get_cache_entry for existing old entries @@ -127,113 +129,137 @@ def test_backward_compatibility(self, temp_cache_dir): def test_remote_translator_cache_initialization(self, temp_cache_dir): """Test that remote translators work without __init__ methods.""" with patch("garak._config.transient.cache_dir", temp_cache_dir): - from garak.langproviders.remote import RivaTranslator, DeeplTranslator, GoogleTranslator - + from garak.langproviders.remote import ( + RivaTranslator, + DeeplTranslator, + GoogleTranslator, + ) + # Test RivaTranslator config_riva = { "langproviders": { "riva": { "language": "en,ja", "model_type": "remote.RivaTranslator", - "api_key": "test_key" + "api_key": "test_key", } } } - + # Mock API key validation and create test subclass - with patch.object(RivaTranslator, '_validate_env_var'), patch.object(RivaTranslator, '_load_langprovider'): + with ( + patch.object(RivaTranslator, "_validate_env_var"), + patch.object(RivaTranslator, "_load_langprovider"), + ): + class TestRivaTranslator(RivaTranslator): def __init__(self, config_root={}): self.language = "en,ja" self.model_type = "remote.RivaTranslator" super().__init__(config_root) - + translator_riva = TestRivaTranslator(config_root=config_riva) - + # Check that cache is initialized assert translator_riva.cache is not None assert "en_ja" in str(translator_riva.cache.cache_file) assert "remote.RivaTranslator" in 
str(translator_riva.cache.cache_file) - + # Test DeeplTranslator config_deepl = { "langproviders": { "deepl": { "language": "en,ja", "model_type": "remote.DeeplTranslator", - "api_key": "test_key" + "api_key": "test_key", } } } - - with patch.object(DeeplTranslator, '_validate_env_var'), patch.object(DeeplTranslator, '_load_langprovider'): + + with ( + patch.object(DeeplTranslator, "_validate_env_var"), + patch.object(DeeplTranslator, "_load_langprovider"), + ): + class TestDeeplTranslator(DeeplTranslator): def __init__(self, config_root={}): self.language = "en,ja" self.model_type = "remote.DeeplTranslator" super().__init__(config_root) - + translator_deepl = TestDeeplTranslator(config_root=config_deepl) - + assert translator_deepl.cache is not None assert "en_ja" in str(translator_deepl.cache.cache_file) - assert "remote.DeeplTranslator" in str(translator_deepl.cache.cache_file) - + assert "remote.DeeplTranslator" in str( + translator_deepl.cache.cache_file + ) + # Test GoogleTranslator config_google = { "langproviders": { "google": { "language": "en,ja", "model_type": "remote.GoogleTranslator", - "api_key": "test_key" + "api_key": "test_key", } } } - - with patch.object(GoogleTranslator, '_validate_env_var'), patch.object(GoogleTranslator, '_load_langprovider'): + + with ( + patch.object(GoogleTranslator, "_validate_env_var"), + patch.object(GoogleTranslator, "_load_langprovider"), + ): + class TestGoogleTranslator(GoogleTranslator): def __init__(self, config_root={}): self.language = "en,ja" self.model_type = "remote.GoogleTranslator" super().__init__(config_root) - + translator_google = TestGoogleTranslator(config_root=config_google) - + assert translator_google.cache is not None assert "en_ja" in str(translator_google.cache.cache_file) - assert "remote.GoogleTranslator" in str(translator_google.cache.cache_file) + assert "remote.GoogleTranslator" in str( + translator_google.cache.cache_file + ) def test_remote_translator_cache_functionality(self, 
temp_cache_dir): """Test that remote translators can use cache functionality.""" with patch("garak._config.transient.cache_dir", temp_cache_dir): from garak.langproviders.remote import RivaTranslator - + config = { "langproviders": { "riva": { "language": "en,ja", "model_type": "remote.RivaTranslator", - "api_key": "test_key" + "api_key": "test_key", } } } - - with patch.object(RivaTranslator, '_validate_env_var'), patch.object(RivaTranslator, '_load_langprovider'): + + with ( + patch.object(RivaTranslator, "_validate_env_var"), + patch.object(RivaTranslator, "_load_langprovider"), + ): + class TestRivaTranslator(RivaTranslator): def __init__(self, config_root={}): self.language = "en,ja" self.model_type = "remote.RivaTranslator" super().__init__(config_root) - + translator = TestRivaTranslator(config_root=config) - + # Test cache functionality test_text = "Hello world" test_translation = "こんにちは世界" - + # Set cache manually translator.cache.set(test_text, test_translation) - + # Verify cache entry cache_entry = translator.cache.get_cache_entry(test_text) assert cache_entry is not None @@ -255,16 +281,16 @@ def test_cache_with_default_model_name(self, temp_cache_dir): del provider.model_name # 属性自体を削除 cache = TranslationCache(provider) - + # Verify default model_name is used assert cache.model_name == "default" - + # Test cache functionality test_text = "Hello world" test_translation = "こんにちは世界" - + cache.set(test_text, test_translation) - + # Verify cache entry includes default model_name cache_entry = cache.get_cache_entry(test_text) assert cache_entry is not None @@ -281,16 +307,16 @@ def test_cache_with_custom_model_name(self, temp_cache_dir): provider.model_name = "custom_model" cache = TranslationCache(provider) - + # Verify custom model_name is used assert cache.model_name == "custom_model" - + # Test cache functionality test_text = "Hello world" test_translation = "こんにちは世界" - + cache.set(test_text, test_translation) - + # Verify cache entry includes custom 
model_name cache_entry = cache.get_cache_entry(test_text) assert cache_entry is not None diff --git a/tests/langservice/test_translation_cache_integration.py b/tests/langservice/test_translation_cache_integration.py index 0d50fe7af..68a801161 100644 --- a/tests/langservice/test_translation_cache_integration.py +++ b/tests/langservice/test_translation_cache_integration.py @@ -9,6 +9,7 @@ from garak.langproviders.base import LangProvider, TranslationCache + class TestTranslationCacheIntegration: """Integration test for translation caching functionality.""" @@ -22,43 +23,49 @@ def test_remote_translator_integration(self, temp_cache_dir): """Test that remote translators work correctly in integration scenarios.""" with patch("garak._config.transient.cache_dir", temp_cache_dir): from garak.langproviders.remote import RivaTranslator - + config = { "langproviders": { "riva": { "language": "en,ja", "model_type": "remote.RivaTranslator", - "api_key": "test_key" + "api_key": "test_key", } } } - + # Mock API key validation and create test subclass - with patch.object(RivaTranslator, '_validate_env_var'), patch.object(RivaTranslator, '_load_langprovider'): + with ( + patch.object(RivaTranslator, "_validate_env_var"), + patch.object(RivaTranslator, "_load_langprovider"), + ): + class TestRivaTranslator(RivaTranslator): def __init__(self, config_root={}): self.language = "en,ja" self.model_type = "remote.RivaTranslator" super().__init__(config_root) - + translator = TestRivaTranslator(config_root=config) - + # Test that translator can be instantiated and has cache assert translator.cache is not None assert translator.source_lang == "en" assert translator.target_lang == "ja" - + # Test that cache file path is correctly generated cache_file_path = translator.cache.cache_file assert "en_ja" in str(cache_file_path) assert "remote.RivaTranslator" in str(cache_file_path) assert "default" in str(cache_file_path) # Default model_name - + # Test that translator can handle translation 
requests (mock) - with patch.object(translator, '_translate_impl', return_value="こんにちは世界"): + with patch.object( + translator, "_translate_impl", return_value="こんにちは世界" + ): result = translator._translate_with_cache("Hello world") assert result == "こんにちは世界" - + # Second call should use cache result2 = translator._translate_with_cache("Hello world") assert result2 == "こんにちは世界" @@ -76,29 +83,33 @@ def __init__(self): self._validate_env_var = lambda: None self._load_langprovider = lambda: None self.cache = TranslationCache(self) + def _translate(self, text): return "" + def _translate_impl(self, text): return "" - + translator = MockLocalProvider() - + # Test that translator can be instantiated and has cache assert translator.cache is not None assert translator.source_lang == "en" assert translator.target_lang == "ja" - + # Test that cache file path is correctly generated cache_file_path = translator.cache.cache_file assert "en_ja" in str(cache_file_path) assert "local" in str(cache_file_path) assert "test_model" in str(cache_file_path) - + # Test that translator can handle translation requests (mock) - with patch.object(translator, '_translate_impl', return_value="こんにちは世界"): + with patch.object( + translator, "_translate_impl", return_value="こんにちは世界" + ): result = translator._translate_with_cache("Hello world") assert result == "こんにちは世界" - + # Second call should use cache result2 = translator._translate_with_cache("Hello world") assert result2 == "こんにちは世界" @@ -106,6 +117,7 @@ def _translate_impl(self, text): def test_cache_persistence_across_sessions(self, temp_cache_dir): """Test that cache persists across different translator sessions (mocked, no Passthru).""" with patch("garak._config.transient.cache_dir", temp_cache_dir): + class MockLocalProvider(LangProvider): def __init__(self): self.language = "en,ja" @@ -115,8 +127,10 @@ def __init__(self): self._validate_env_var = lambda: None self._load_langprovider = lambda: None self.cache = TranslationCache(self) + def 
_translate(self, text): return "" + # Create first translator instance translator1 = MockLocalProvider() # Set cache entry From 593bc8e9ebed4d1aeb4844915837ebd26554e39d Mon Sep 17 00:00:00 2001 From: Masaya Ogushi Date: Tue, 15 Jul 2025 20:39:01 +0900 Subject: [PATCH 10/11] fix: remove extra test --- tests/langservice/test_translation_cache.py | 52 --------------------- 1 file changed, 52 deletions(-) diff --git a/tests/langservice/test_translation_cache.py b/tests/langservice/test_translation_cache.py index 6a6bcecd3..81dcd12b0 100644 --- a/tests/langservice/test_translation_cache.py +++ b/tests/langservice/test_translation_cache.py @@ -101,31 +101,6 @@ def test_cache_stores_original_text(self, temp_cache_dir): assert cache_entry["model_type"] == "local" assert cache_entry["model_name"] == "test_model" - def test_backward_compatibility(self, temp_cache_dir): - """Test backward compatibility with old cache format.""" - with patch("garak._config.transient.cache_dir", temp_cache_dir): - # Create mock LangProvider instance - provider = MagicMock() - provider.source_lang = "en" - provider.target_lang = "ja" - provider.model_type = "local" - provider.model_name = "test_model" - - cache = TranslationCache(provider) - - # Simulate old cache format (string values) - cache._cache["old_key"] = "old_translation" - - # Should still work with get method - result = cache.get( - "some_text" - ) # This will return None for non-existent key - assert result is None - - # Should work with get_cache_entry for existing old entries - # Note: This is a bit tricky since we need the original text - # For now, just test that the cache still loads - def test_remote_translator_cache_initialization(self, temp_cache_dir): """Test that remote translators work without __init__ methods.""" with patch("garak._config.transient.cache_dir", temp_cache_dir): @@ -269,33 +244,6 @@ def __init__(self, config_root={}): assert cache_entry["target_lang"] == "ja" assert cache_entry["model_type"] == 
"remote.RivaTranslator" - def test_cache_with_default_model_name(self, temp_cache_dir): - """Test cache works with default model name when model_name is not set.""" - with patch("garak._config.transient.cache_dir", temp_cache_dir): - # Create mock LangProvider instance without model_name - provider = MagicMock() - provider.source_lang = "en" - provider.target_lang = "ja" - provider.model_type = "local" - provider.model_name = "default_should_be_deleted" - del provider.model_name # 属性自体を削除 - - cache = TranslationCache(provider) - - # Verify default model_name is used - assert cache.model_name == "default" - - # Test cache functionality - test_text = "Hello world" - test_translation = "こんにちは世界" - - cache.set(test_text, test_translation) - - # Verify cache entry includes default model_name - cache_entry = cache.get_cache_entry(test_text) - assert cache_entry is not None - assert cache_entry["model_name"] == "default" - def test_cache_with_custom_model_name(self, temp_cache_dir): """Test cache works with custom model name.""" with patch("garak._config.transient.cache_dir", temp_cache_dir): From 5ba91f61b3081a53a899aed94abf48ce58e58645 Mon Sep 17 00:00:00 2001 From: Jeffrey Martin Date: Thu, 24 Jul 2025 15:04:33 -0500 Subject: [PATCH 11/11] suppress cache file for PassThru provider Signed-off-by: Jeffrey Martin --- garak/langproviders/base.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/garak/langproviders/base.py b/garak/langproviders/base.py index b483e123e..f6587fa0d 100644 --- a/garak/langproviders/base.py +++ b/garak/langproviders/base.py @@ -133,7 +133,7 @@ def is_meaning_string(text: str) -> bool: # To be `Configurable` the root object must meet the standard type search criteria # { langproviders: # "local": { # model_type -# "language": "-" +# "language": "," # "name": "model/name" # model_name # "hf_args": {} # or any other translator specific values for the model_type # } @@ -143,6 +143,9 @@ def is_meaning_string(text: str) -> bool: 
class TranslationCache: def __init__(self, provider: "LangProvider"): + if not hasattr(provider, "model_type"): + return None # providers without a model_type do not have a cache + self.source_lang = provider.source_lang self.target_lang = provider.target_lang self.model_type = provider.model_type