35 changes: 35 additions & 0 deletions docs/source/translation.rst
@@ -13,6 +13,41 @@ Limitations
- If probes or detectors fail to load, you may need to choose a smaller local translation model or utilize a remote service.
- Translation may add significant execution time to the run depending on resources available.

Translation Caching
------------------

Garak implements a translation caching system to improve performance and reduce API costs when using translation services. The caching mechanism automatically stores and retrieves translation results to avoid redundant API calls.

**How it works:**

- Each translation pair (source language → target language) gets its own cache file
- Cache files are stored in JSON format under the cache directory: ``{cache_dir}/translation/translation_cache_{source_lang}_{target_lang}_{model_type}_{model_name}.json`` (see the example after this list)
Collaborator:
do model_type and model_name refer to the translator model? what do API translates look like?

- Translation results are keyed by MD5 hash of the input text for efficient storage and retrieval
- Cache files persist between runs, allowing translations to be reused across multiple garak sessions
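
For illustration, the cache filename for a hypothetical ``en`` to ``ja`` pair served by a local Hugging Face model would be assembled like this; the model name below is made up, and the slash-to-underscore substitution mirrors the implementation in this change:

.. code-block:: python

    source_lang, target_lang = "en", "ja"
    model_type = "local"
    model_name = "Helsinki-NLP/opus-mt-en-jap"  # illustrative model name

    cache_filename = (
        f"translation_cache_{source_lang}_{target_lang}_"
        f"{model_type}_{model_name.replace('/', '_')}.json"
    )
    # -> translation_cache_en_ja_local_Helsinki-NLP_opus-mt-en-jap.json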

**Benefits:**

- **Performance**: Significantly reduces translation time for repeated text
- **Cost savings**: Reduces API calls to paid services like DeepL, Google Cloud Translation, and NVIDIA Riva
- **Reliability**: Provides fallback for offline scenarios when cached translations are available
- **Consistency**: Ensures identical translations for the same input text across different runs

**Cache management:**

- Cache files are automatically created when translations are performed
- Corrupted cache files are handled gracefully with fallback to empty cache
- Cache files can be manually deleted to force fresh translations
- Cache directory location follows garak's standard cache configuration

**Supported for all translation services:**

- Local translation models (Hugging Face)
- DeepL API
- NVIDIA Riva API
- Google Cloud Translation API

The caching system is transparent to users and requires no additional configuration. It automatically activates when translation services are used.
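
If you need to force fresh translations for a particular language pair, you can delete its cache files before a run. A minimal sketch, assuming a configured garak environment and an ``en`` to ``ja`` pair (adjust the glob pattern to your own configuration):

.. code-block:: python

    from garak import _config

    # Remove any cached en→ja translations so the next run re-translates them.
    cache_dir = _config.transient.cache_dir / "translation"
    for cache_file in cache_dir.glob("translation_cache_en_ja_*.json"):
        cache_file.unlink()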

Supported translation services
------------------------------

104 changes: 101 additions & 3 deletions garak/langproviders/base.py
@@ -10,8 +10,13 @@
import unicodedata
import string
import logging
import json
import hashlib
import os
from pathlib import Path
from garak.resources.api import nltk
from langdetect import detect, DetectorFactory, LangDetectException
from garak import _config

_intialized_words = False

@@ -128,14 +133,94 @@ def is_meaning_string(text: str) -> bool:
# To be `Configurable` the root object must meet the standard type search criteria
# { langproviders:
# "local": { # model_type
# "language": "<from>-<to>"
# "language": "<from>,<to>"
# "name": "model/name" # model_name
# "hf_args": {} # or any other translator specific values for the model_type
# }
# }
from garak.configurable import Configurable


class TranslationCache:
def __init__(self, provider: "LangProvider"):
if not hasattr(provider, "model_type"):
return None # providers without a model_type do not have a cache
Collaborator:
why? is this expressing a non-explicit convention in LangProvider naming? if so would prefer this to be expressed explicitly

Collaborator:
This is checking an internal detail of the implemented object. We could add some other defining factor; however, at this time only the PassThru class meets such criteria.

Collaborator:
We should add some other defining factor. I can open an issue or we can do it here. What would you prefer? A class attrib?


self.source_lang = provider.source_lang
self.target_lang = provider.target_lang
self.model_type = provider.model_type
self.model_name = "default"
if hasattr(provider, "model_name"):
self.model_name = provider.model_name

cache_dir = _config.transient.cache_dir / "translation"
cache_dir.mkdir(mode=0o740, parents=True, exist_ok=True)
cache_filename = f"translation_cache_{self.source_lang}_{self.target_lang}_{self.model_type}_{self.model_name.replace('/', '_')}.json"
Collaborator:
Prefer dbm backend for this vs. a large inflight dict with occasional serdes (i guess that word kinda fits here?) action. Can be a fast follow

self.cache_file = cache_dir / cache_filename
logging.info(f"Cache file: {self.cache_file}")
Collaborator:
needs verb + qualification

Suggested change:
- logging.info(f"Cache file: {self.cache_file}")
+ logging.info(f"Loading translation cache file: {self.cache_file}")

self._cache = self._load_cache()

def _load_cache(self) -> dict:
if self.cache_file.exists():
try:
with open(self.cache_file, "r", encoding="utf-8") as f:
return json.load(f)
except (json.JSONDecodeError, IOError) as e:
logging.warning(f"Failed to load translation cache: {e}")
return {}
return {}

def _save_cache(self):
try:
with open(self.cache_file, "w", encoding="utf-8") as f:
json.dump(self._cache, f, ensure_ascii=False, indent=2)
except IOError as e:
logging.warning(f"Failed to save translation cache: {e}")

def get_cache_key(self, text: str) -> str:
return hashlib.md5(text.encode("utf-8"), usedforsecurity=False).hexdigest()
Comment on lines +180 to +181

Collaborator:
This seems like potentially a lot of overhead. The approach is, IMO, fine for now, but perhaps we should consider partial hashing or something? Given potentially many thousands of strings that are often very long, we could consider a faster cache lookup approach in the future.

Contributor Author:
Most cases are short sentences. If we support long sentences, we can consider partial hashing in the future.

Collaborator:
I thought about this and measured hashlib.md5 speed. It seems pretty fine.

An alternative is to use the raw string as key and rely on the underlying index's hashing (e.g. dict keys). This halves the amount of hashing. Might bump into length limits though ¯\_(ツ)_/¯


def get(self, text: str) -> str | None:
cache_key = self.get_cache_key(text)
Collaborator:
Suggested change:
- cache_key = self.get_cache_key(text)
+ if not len(self._cache):
+     return None
+ cache_key = self.get_cache_key(text)

Would save us a ton of hashing. At some point in the future, we could even add hierarchy and do it by probe or something so that we simply avoid generating a ton of hashes to look up in an empty cache for no reason.

Collaborator:
The current implementation is not caching full prompts or responses but individual sentences; I do think in a future revision we will see some optimization to add here.

Collaborator:
+1 for skipping on empty cache

cache_entry = self._cache.get(cache_key)
Comment on lines +183 to +185

Collaborator:
Since we're going to have a ton of cache misses on first invocation, does it make sense for us to have an easier way to avoid cache misses? Maybe just a check if the cache is empty?

Collaborator:
The cache is built in real time, and many of the probes result in hits based on responses very quickly in the first run. These checks that result in a miss even on the first run seem worth the cost IMO.

if cache_entry and isinstance(cache_entry, dict):
return cache_entry.get("translation")
elif isinstance(cache_entry, str):
# Backward compatibility with old format
return cache_entry
return None
Comment on lines +186 to +191

Collaborator:
Why is backward compat for "old format" present in a PR introducing a new feature for the first time?

I'm not quite happy with the cache format having dict entries - this is overkill.

If we're specifying in the cache key (i.e. filename) the following:

  • src lang code
  • dst lang code
  • langprovider

then we don't need anything more than a direct key/value store here, where the key is the src-language text and the value is the dst-language text.

I guess the database engine in this PR is an inflight dict that gets JSON serialised/deserialised. This can ditch hashing and use the src text as the lookup key - that'll ease troubleshooting and reduce hashing.

An alternative is a dbm engine with the same src/dst key-value scheme.

Is there a requirement for any more info than this? We have create/modify/access stamps as the database file object metadata. I don't think we have size constraints, so timestamping at the level of individual cache entries seems overkill. I guess if we want a stable format we might set the value as a dict where one value is the text to be retrieved, but this seems overkill.

Happy to include tooling for migrating DBs between formats, providing external routes for people to move to the current garak setup like we do with fixers, but I don't want to support multiple cache format readers as core functionality.
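
For concreteness, a minimal sketch of the direct key/value store proposed above, keyed on the source text itself, with the language pair and provider carried only in the filename. This is illustrative only, not part of this PR; the class name and structure are made up:

```python
import json
import logging
from pathlib import Path


class SimpleTranslationCache:
    """Illustrative src-text -> dst-text store; pair/provider are encoded in the filename."""

    def __init__(self, cache_file: Path):
        self.cache_file = cache_file
        try:
            self._cache = json.loads(cache_file.read_text(encoding="utf-8"))
        except (FileNotFoundError, json.JSONDecodeError) as e:
            logging.debug("Starting with an empty translation cache: %s", e)
            self._cache = {}

    def get(self, text: str) -> str | None:
        # dict lookup hashes the key internally; no explicit MD5 digest needed
        return self._cache.get(text)

    def set(self, text: str, translation: str) -> None:
        self._cache[text] = translation
        self.cache_file.write_text(
            json.dumps(self._cache, ensure_ascii=False, indent=2), encoding="utf-8"
        )
```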


def set(self, text: str, translation: str):
cache_key = self.get_cache_key(text)
self._cache[cache_key] = {
Comment on lines +194 to +195

Collaborator:
Let's just use the source text itself as the key.

Collaborator:
Would this be guaranteed to be consistently viable? Using a hash provides a known, consistent format that is also cross-platform compatible (no OS-specific escaping of the alphanumeric hash strings would apply). While in theory all Python versions should use consistent internal representations of strings, that assumption is not guaranteed, and lines in a prompt could contain something odd; in fact this is a core expectation, as we test the edges of things.

Collaborator (@leondz, Sep 8, 2025):
It's as consistently viable as the underlying object. Python dict requires that keys are hashable. Python str is a hashable type. Max str length is predicated on arch; for 32-bit it's ~2.1GB.

So yes, consistently viable. Not universally viable, but we will have Other Problems (and many of them) before it breaks.

Python already hashes the keys we're using (strings) and does all the hard work putting them into hashed lookups (dicts). No need to duplicate that work.

"original": text,
"translation": translation,
"source_lang": self.source_lang,
"target_lang": self.target_lang,
"model_type": self.model_type,
"model_name": self.model_name,
Comment on lines +198 to +201

Collaborator:
this info is redundant - it's logged in the cache object's filename - suggest it is deleted

}
self._save_cache()
Collaborator:
this means a Lot of serialisation and saving!


def get_cache_entry(self, text: str) -> dict | None:
"""Get full cache entry including original text and metadata."""
cache_key = self.get_cache_key(text)
cache_entry = self._cache.get(cache_key)
if cache_entry and isinstance(cache_entry, dict):
return cache_entry
elif isinstance(cache_entry, str):
# Backward compatibility with old format
return {
"original": text,
"translation": cache_entry,
"source_lang": self.source_lang,
"target_lang": self.target_lang,
"model_type": self.model_type,
"model_name": self.model_name,
}
return None
Comment on lines +205 to +221

Collaborator:
Let's not have backward compatibility in the first PR landing a feature.

Let's simplify the cache entries by removing information we have elsewhere.

Let's index on the text directly, halving the amount of hashing done.

Suggested change:
- def get_cache_entry(self, text: str) -> dict | None:
-     """Get full cache entry including original text and metadata."""
-     cache_key = self.get_cache_key(text)
-     cache_entry = self._cache.get(cache_key)
-     if cache_entry and isinstance(cache_entry, dict):
-         return cache_entry
-     elif isinstance(cache_entry, str):
-         # Backward compatibility with old format
-         return {
-             "original": text,
-             "translation": cache_entry,
-             "source_lang": self.source_lang,
-             "target_lang": self.target_lang,
-             "model_type": self.model_type,
-             "model_name": self.model_name,
-         }
-     return None
+ def get_cache_entry(self, text: str) -> str | None:
+     return self._cache.get(text)
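
And a minimal sketch of the dbm alternative mentioned in the review, which writes entries through to disk as they are set instead of re-serialising a whole JSON file on every update. Illustrative only, not part of this PR; the class name is made up:

```python
import dbm
from pathlib import Path


class DbmTranslationCache:
    """Illustrative dbm-backed src-text -> dst-text store."""

    def __init__(self, cache_path: Path):
        # "c" opens for read/write, creating the database if it does not exist
        self._db = dbm.open(str(cache_path), "c")

    def get(self, text: str) -> str | None:
        key = text.encode("utf-8")
        if key in self._db:
            return self._db[key].decode("utf-8")
        return None

    def set(self, text: str, translation: str) -> None:
        self._db[text.encode("utf-8")] = translation.encode("utf-8")

    def close(self) -> None:
        self._db.close()
```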



class LangProvider(Configurable):
"""Base class for objects that provision language"""

@@ -147,6 +232,9 @@ def __init__(self, config_root: dict = {}) -> None:

self._validate_env_var()

# Use TranslationCache for caching
self.cache = TranslationCache(self)

self._load_langprovider()

def _load_langprovider(self):
@@ -155,6 +243,16 @@ def _load_langprovider(self):
def _translate(self, text: str) -> str:
raise NotImplementedError

def _translate_with_cache(self, text: str) -> str:
"""Translate text with caching support."""
cached_translation = self.cache.get(text)
if cached_translation is not None:
logging.debug(f"Using cached translation for text: {text[:50]}...")
Collaborator:
This should not log the original text; these values can come from prompts or LLM responses and could produce escape sequences that could impact something that is monitoring the logs.

Suggested change:
- logging.debug(f"Using cached translation for text: {text[:50]}...")

I could see, however, keeping a tracker in the service for cache hit/miss rates. While not something we need to expose at this time, having the metric values in the runtime could be helpful during debugging scenarios.

return cached_translation
translation = self._translate_impl(text)
self.cache.set(text, translation)
Collaborator:
Can we move to a direct src->dst cache? i.e.

Suggested change:
- self.cache.set(text, translation)
+ self.cache[text] = translation

return translation
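
One of the review comments above suggests tracking cache hit/miss rates for debugging. A minimal sketch of how that could be layered onto the method above as a drop-in variant; the counter attribute names are illustrative and not part of this PR:

```python
def _translate_with_cache(self, text: str) -> str:
    """Translate text with caching support, tracking hit/miss counts."""
    cached_translation = self.cache.get(text)
    if cached_translation is not None:
        self._cache_hits = getattr(self, "_cache_hits", 0) + 1
        return cached_translation
    self._cache_misses = getattr(self, "_cache_misses", 0) + 1
    translation = self._translate_impl(text)
    self.cache.set(text, translation)
    return translation
```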

def _get_response(self, input_text: str):
translated_lines = []

@@ -189,7 +287,7 @@ def _short_sentence_translate(self, line: str) -> str:
if needs_translation:
cleaned_line = self._clean_line(line)
if cleaned_line:
translated_line = self._translate(cleaned_line)
translated_line = self._translate_with_cache(cleaned_line)
translated_lines.append(translated_line)

return translated_lines
@@ -202,7 +300,7 @@ def _long_sentence_translate(self, line: str) -> str:
if self._should_skip_line(cleaned_sentence):
translated_lines.append(cleaned_sentence)
continue
translated_line = self._translate(cleaned_sentence)
translated_line = self._translate_with_cache(cleaned_sentence)
translated_lines.append(translated_line)

return translated_lines
6 changes: 6 additions & 0 deletions garak/langproviders/local.py
@@ -19,6 +19,7 @@ def _load_langprovider(self):
pass

def _translate(self, text: str) -> str:
# Passthru returns text unchanged, so no caching is applied here
return text

def get_text(
@@ -110,6 +111,11 @@ def _load_langprovider(self):
self.tokenizer = MarianTokenizer.from_pretrained(model_name)

def _translate(self, text: str) -> str:
# Use _translate_with_cache to enable caching
return self._translate_with_cache(text)

def _translate_impl(self, text: str) -> str:
"""Actual translation implementation without caching."""
if "m2m100" in self.model_name:
self.tokenizer.src_lang = self.source_lang

15 changes: 15 additions & 0 deletions garak/langproviders/remote.py
@@ -91,6 +91,11 @@ def _load_langprovider(self):

# TODO: consider adding a backoff here and determining if a connection needs to be re-established
def _translate(self, text: str) -> str:
# Use _translate_with_cache to enable caching
return self._translate_with_cache(text)

def _translate_impl(self, text: str) -> str:
"""Actual translation implementation without caching."""
try:
if self.client is None:
self._load_langprovider()
@@ -152,6 +157,11 @@ def _load_langprovider(self):
self._tested = True

def _translate(self, text: str) -> str:
# Use _translate_with_cache to enable caching
return self._translate_with_cache(text)

def _translate_impl(self, text: str) -> str:
"""Actual translation implementation without caching."""
try:
return self.client.translate_text(
text, source_lang=self._source_lang, target_lang=self._target_lang
@@ -230,6 +240,11 @@ def _load_langprovider(self):
self._tested = True

def _translate(self, text: str) -> str:
# Use _translate_with_cache to enable caching
return self._translate_with_cache(text)

def _translate_impl(self, text: str) -> str:
"""Actual translation implementation without caching."""
retry = 5
while retry > 0:
try:
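
Finally, a minimal sketch of a round-trip check for the cache class added in this PR, assuming pytest's tmp_path fixture, that _config.transient.cache_dir can be redirected in a test, and the TranslationCache implementation as shown above; the dummy provider is illustrative:

```python
from types import SimpleNamespace

from garak import _config
from garak.langproviders.base import TranslationCache


def test_translation_cache_roundtrip(tmp_path):
    # redirect garak's cache directory to a temporary location for the test
    _config.transient.cache_dir = tmp_path

    provider = SimpleNamespace(
        source_lang="en",
        target_lang="ja",
        model_type="local",
        model_name="dummy/model",
    )
    cache = TranslationCache(provider)

    assert cache.get("Hello") is None          # first lookup misses
    cache.set("Hello", "こんにちは")            # store a translation
    assert cache.get("Hello") == "こんにちは"   # subsequent lookup hits

    # a fresh cache object for the same pair reloads the persisted JSON file
    assert TranslationCache(provider).get("Hello") == "こんにちは"
```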