pyrit/prompt_converter/fuzzer_converter/fuzzer_converter_base.py (70 lines of code) (raw):

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import json
import logging
import uuid
from typing import Optional

from pyrit.exceptions import (
    InvalidJsonException,
    pyrit_json_retry,
    remove_markdown_json,
)
from pyrit.models import (
    PromptDataType,
    PromptRequestPiece,
    PromptRequestResponse,
    SeedPrompt,
)
from pyrit.prompt_converter import ConverterResult, PromptConverter
from pyrit.prompt_target import PromptChatTarget

logger = logging.getLogger(__name__)


class FuzzerConverter(PromptConverter):
    """
    Base class for GPTFUZZER converters.

    Adapted from GPTFUZZER: Red Teaming Large Language Models with Auto-Generated Jailbreak Prompts.

    Paper https://arxiv.org/pdf/2309.10253 by Jiahao Yu, Xingwei Lin, Zheng Yu, Xinyu Xing
    GitHub https://github.com/sherdencooper/GPTFuzz/tree/master

    Parameters:
        converter_target (PromptChatTarget): Chat target used to perform fuzzing on user prompt
        prompt_template (SeedPrompt): Template to be used instead of the default system prompt with instructions
            for the chat target.
    """

    def __init__(self, *, converter_target: PromptChatTarget, prompt_template: Optional[SeedPrompt] = None):
        """
        Store the chat target and the system prompt taken from ``prompt_template``.

        Args:
            converter_target (PromptChatTarget): Chat target that receives the fuzzing request.
            prompt_template (SeedPrompt): Template whose ``value`` becomes the system prompt.
                The parameter keeps its ``None`` default for signature compatibility, but a
                template must actually be supplied.

        Raises:
            ValueError: If ``prompt_template`` is None.
        """
        # Fail with an explicit message instead of an AttributeError on ``None.value``.
        if prompt_template is None:
            raise ValueError("prompt_template must be provided; it supplies the system prompt for the converter.")

        self.converter_target = converter_target
        self.system_prompt = prompt_template.value
        # Label used in the BEGINS/ENDS markers that wrap the prompt in convert_async.
        self.template_label = "TEMPLATE"

    def update(self, **kwargs) -> None:
        """Accept arbitrary keyword arguments and do nothing; this base implementation is a no-op."""
        pass

    async def convert_async(self, *, prompt: str, input_type: PromptDataType = "text") -> ConverterResult:
        """
        Send the prompt, wrapped in template markers, to the converter target and return its output.

        A fresh conversation is created for every call: the stored system prompt is set on the
        target, the prompt is wrapped between ``====<label> BEGINS====`` / ``====<label> ENDS====``
        markers, and the request is sent with ``response_format`` set to ``json`` in its metadata.

        Args:
            prompt (str): The prompt to be converted.
            input_type (PromptDataType): The type of the input prompt.

        Returns:
            ConverterResult: The value returned by ``send_prompt_async`` as ``output_text``,
            with ``output_type`` set to "text".

        Raises:
            ValueError: If ``input_type`` is not supported.
        """
        if not self.input_supported(input_type):
            raise ValueError("Input type not supported")

        # Each conversion runs in its own conversation on the target.
        conversation_id = str(uuid.uuid4())

        self.converter_target.set_system_prompt(
            system_prompt=self.system_prompt,
            conversation_id=conversation_id,
            orchestrator_identifier=None,
        )

        formatted_prompt = f"===={self.template_label} BEGINS====\n{prompt}\n===={self.template_label} ENDS===="

        prompt_metadata: dict[str, str | int] = {"response_format": "json"}
        request = PromptRequestResponse(
            [
                PromptRequestPiece(
                    role="user",
                    original_value=formatted_prompt,
                    converted_value=formatted_prompt,
                    conversation_id=conversation_id,
                    sequence=1,
                    prompt_target_identifier=self.converter_target.get_identifier(),
                    original_value_data_type=input_type,
                    converted_value_data_type=input_type,
                    converter_identifiers=[self.get_identifier()],
                    prompt_metadata=prompt_metadata,
                )
            ]
        )

        response = await self.send_prompt_async(request)

        return ConverterResult(output_text=response, output_type="text")

    @pyrit_json_retry
    async def send_prompt_async(self, request: PromptRequestResponse):
        """
        Send ``request`` to the converter target and extract the ``output`` field of its JSON reply.

        Args:
            request (PromptRequestResponse): The request to send to the converter target.

        Returns:
            The value stored under the "output" key of the parsed JSON response.

        Raises:
            InvalidJsonException: If the response is not valid JSON, is not a JSON object,
                or has no "output" key.
        """
        # NOTE(review): presumably ``pyrit_json_retry`` re-issues the request when
        # InvalidJsonException is raised - confirm against its definition.
        response = await self.converter_target.send_prompt_async(prompt_request=request)

        response_msg = response.get_value()
        response_msg = remove_markdown_json(response_msg)

        try:
            parsed_response = json.loads(response_msg)
        except json.JSONDecodeError as exc:
            # Chain the decode error so the original cause stays in the traceback.
            raise InvalidJsonException(message=f"Invalid JSON encountered: {response_msg}") from exc

        # Valid JSON that is not an object (number, string, list, ...) cannot carry an
        # "output" key; report it the same way as a missing key rather than failing with
        # a TypeError on the membership test or the subscript.
        if not isinstance(parsed_response, dict) or "output" not in parsed_response:
            raise InvalidJsonException(message=f"Invalid JSON encountered; missing 'output' key: {response_msg}")

        return parsed_response["output"]

    def input_supported(self, input_type: PromptDataType) -> bool:
        """Return True only when ``input_type`` is "text"."""
        return input_type == "text"

    def output_supported(self, output_type: PromptDataType) -> bool:
        """Return True only when ``output_type`` is "text"."""
        return output_type == "text"