pyrit/datasets/multilingual_vulnerability_dataset.py (24 lines of code) (raw):

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import pandas as pd

from pyrit.models import SeedPromptDataset
from pyrit.models.seed_prompt import SeedPrompt


def fetch_multilingual_vulnerability_dataset() -> SeedPromptDataset:
    """
    Fetch multilingual vulnerability examples from "A Framework to Assess Multilingual Vulnerabilities of LLMs"
    and create a SeedPromptDataset.

    The CSV is downloaded from the upstream GitHub repository on every call. One SeedPrompt is built
    per CSV row, using the ``en`` column as the prompt value, the ``id`` column (converted to ``str``)
    as the prompt name, and the ``type`` column as the single harm category.

    Returns:
        SeedPromptDataset: A SeedPromptDataset containing the examples.

    Raises:
        KeyError: If the downloaded CSV lacks one of the ``en``, ``id`` or ``type`` columns.
    """
    url = "https://raw.githubusercontent.com/CarsonDon/Multilingual-Vuln-LLMs/main/prompts/allprompt.csv"
    df = pd.read_csv(url)

    # Metadata shared by every prompt in this dataset; built once rather than once per row.
    description = (
        "Dataset from 'A Framework to Assess Multilingual Vulnerabilities of LLMs'. "
        "Multilingual prompts demonstrating LLM vulnerabilities, labeled by type. "
        "Paper: https://arxiv.org/pdf/2503.13081"
    )
    # One entry per author: a single comma-joined string would be iterated character by character
    # by any consumer that treats ``authors`` as a sequence of names.
    authors = [
        "Likai Tang",
        "Niruth Bogahawatta",
        "Yasod Ginige",
        "Jiarui Xu",
        "Shixuan Sun",
        "Surangika Ranathunga",
        "Suranga Seneviratne",
    ]
    source = "https://github.com/CarsonDon/Multilingual-Vuln-LLMs"

    seed_prompts = [
        SeedPrompt(
            value=row["en"],
            data_type="text",
            name=str(row["id"]),
            dataset_name="Multilingual-Vulnerability",
            harm_categories=[row["type"]],
            description=description,
            authors=authors,
            source=source,
        )
        for _, row in df.iterrows()
    ]

    seed_prompt_dataset = SeedPromptDataset(prompts=seed_prompts)
    return seed_prompt_dataset