in pyrit/auxiliary_attacks/gcg/experiments/run.py [0:0]
def run_trainer(*, model_name: str, setup: str = "single", **extra_config_parameters) -> None:
    """
    Train and generate an adversarial suffix for the given model.

    The effective configuration is assembled in layers, later layers overriding
    earlier ones:

    1. the YAML file ``configs/{individual|transfer}_{model_name}.yaml``
    2. runtime defaults built here (training data URL, result prefix, token)
    3. ``extra_config_parameters`` supplied by the caller
    4. ``model_name`` itself, which is always set from the argument

    Args:
        model_name (str): The name of the model. Must be a member of
            ``MODEL_NAMES``; the error message lists "mistral", "llama_2",
            "llama_3", "vicuna" and "phi_3_mini".
            NOTE(review): earlier documentation also listed "all_models";
            whether it is accepted depends on ``MODEL_NAMES`` -- confirm.
        setup (str): Identifier for the setup, currently supports
            - "single": one prompt one model (uses the "individual" config)
            - "multiple": multiple prompts one model or multiple prompts multiple models
            Any value other than "single" selects the "transfer" config and
            enables ``progressive_goals`` and ``stop_on_success``.
        **extra_config_parameters: Additional configuration entries that override
            both the YAML file and the runtime defaults.

    Raises:
        ValueError: If ``model_name`` is not in ``MODEL_NAMES``, or if the
            ``HUGGINGFACE_TOKEN`` environment variable is unset or empty.
    """
    if model_name not in MODEL_NAMES:
        raise ValueError(
            "Model name not supported. Currently supports 'mistral', 'llama_2', 'llama_3', 'vicuna', and 'phi_3_mini'"
        )

    # Called before the token lookup; presumably this is what populates
    # HUGGINGFACE_TOKEN from the project's env files -- verify against the helper.
    _load_environment_files()
    hf_token = os.environ.get("HUGGINGFACE_TOKEN")
    if not hf_token:
        raise ValueError("Please set the HUGGINGFACE_TOKEN environment variable")

    runtime_config: Dict[str, Union[str, bool, Any]] = {
        "train_data": (
            "https://raw.githubusercontent.com/llm-attacks/llm-attacks/main/data/advbench/harmful_behaviors.csv"
        ),
        # NOTE(review): the prefix says "individual_behaviors" even when the
        # transfer config is selected -- confirm this is intended.
        "result_prefix": f"results/individual_behaviors_{model_name}_gcg",
        "token": hf_token,
    }

    if setup != "single":
        runtime_config["progressive_goals"] = True
        runtime_config["stop_on_success"] = True
        config_name = "transfer"
    else:
        config_name = "individual"

    config = _load_yaml_to_dict(f"configs/{config_name}_{model_name}.yaml")
    config.update(runtime_config)
    config.update(extra_config_parameters)
    # Set last so neither the YAML file nor the caller's extras can override it.
    config["model_name"] = model_name

    trainer = GreedyCoordinateGradientAdversarialSuffixGenerator()

    # exist_ok=True avoids the check-then-create race of exists() + makedirs()
    # when several runs start concurrently in the same working directory.
    os.makedirs("results", exist_ok=True)

    trainer.generate_suffix(**config)