Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions garak/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -611,13 +611,17 @@ def worker_count_validation(workers):
from garak.resources.autodan import autodan_generate

try:
prompt = _config.probe_options["prompt"]
target = _config.probe_options["target"]
probe_options = parse_cli_plugin_config("probe", args)
if probe_options is None:
raise ValueError("probe_options is None")
prompt = probe_options["prompt"]
target = probe_options["target"]
Comment on lines +614 to +618
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is inconsistent with other CLI options: the help details mention prompt_options, while the exception below mentions probe_options. This PR needs to be expanded to ensure consistent messaging.

This also looks like something of a divergence from how configuration is generally done in the tooling, since, as coded, this is only possible as a CLI option.

At a minimum rely on the general configuration object that has already been processed and merged with file based configuration:

Suggested change
probe_options = parse_cli_plugin_config("probe", args)
if probe_options is None:
raise ValueError("probe_options is None")
prompt = probe_options["prompt"]
target = probe_options["target"]
probe_options = config_plugin_type.get("probe", None)
if probe_options is None:
raise ValueError("probe_options is None")
prompt = probe_options["prompt"]
target = probe_options["target"]

As an alternative, maybe the generate_autodan option should be removed, and the expectation adjusted to execute dan.AutoDAN with exposed DEFAULT_PARAMS for goal_str and target, which map to prompt and target here.

except Exception as e:
print(
"AutoDAN generation requires --probe_options with a .json containing a `prompt` and `target` "
"string"
)
raise SystemExit(1)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
raise SystemExit(1)
raise

The exception handler should simply re-raise.

autodan_generate(generator=generator, prompt=prompt, target=target)

command.start_run() # start the run now that all config validation is complete
Expand Down
5 changes: 4 additions & 1 deletion garak/resources/autodan/autodan.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from garak._plugins import load_plugin
from garak.generators import Generator
from garak.generators.huggingface import Model
from garak.attempt import Conversation, Turn, Message
import garak._config
from garak.data import path as data_path
from garak.resources.autodan.genetic import (
Expand Down Expand Up @@ -178,7 +179,9 @@ def autodan_generate(
best_new_adv_prefix = new_adv_prefixes[best_new_adv_prefix_id]

adv_prefix = best_new_adv_prefix
gen_str = generator.generate(prompt=adv_prefix)[0]
conv = Conversation(turns=[Turn(role="user", content=Message(text=adv_prefix))])
gen_output = generator.generate(prompt=conv)[0]
gen_str = gen_output.text if isinstance(gen_output, Message) else str(gen_output)
success = check_for_attack_success(gen_str, REJECTION_STRINGS)
Comment on lines +184 to 185
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

generator.generate must return a list[Message | None]; this means that if an element is not a Message it will be None, and casting that to a string is not a useful action. Another possible condition is for gen_output.text to be None.

Consider:

Suggested change
gen_str = gen_output.text if isinstance(gen_output, Message) else str(gen_output)
success = check_for_attack_success(gen_str, REJECTION_STRINGS)
success = False
if gen_output and gen_output.text:
success = check_for_attack_success(gen_output.text, REJECTION_STRINGS)

if success:
logger.info(
Expand Down
15 changes: 8 additions & 7 deletions garak/resources/autodan/genetic.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

from garak.resources.api import nltk
from garak.resources.autodan.model_utils import AutoDanPrefixManager, forward
from garak.attempt import Conversation, Turn, Message

logger = getLogger(__name__)

Expand Down Expand Up @@ -278,13 +279,13 @@ def gpt_mutate(mutation_generator, sentence: str) -> str:
while not received:
try:
# TODO: Make the model configurable.
response = mutation_generator.generate(
prompt=[
{"role": "system", "content": system_msg},
{"role": "user", "content": user_message},
]
)
revised_sentence = response[0].replace("\n", "")
conv = Conversation(turns=[
Turn(role="system", content=Message(text=system_msg)),
Turn(role="user", content=Message(text=user_message)),
])
response = mutation_generator.generate(prompt=conv)
response_text = response[0].text if isinstance(response[0], Message) else str(response[0])
revised_sentence = response_text.replace("\n", "")
received = True
Comment on lines +286 to 289
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Similar to the other comment:

Suggested change
response = mutation_generator.generate(prompt=conv)
response_text = response[0].text if isinstance(response[0], Message) else str(response[0])
revised_sentence = response_text.replace("\n", "")
received = True
response = mutation_generator.generate(prompt=conv)[0]
if response and response.text:
revised_sentence = response.text.replace("\n", "")
received = True

except Exception as e:
logger.error(e)
Expand Down