scripts/extract_task_data.py (57 lines of code) (raw):

"""Example script: extract the problem statement of a SWE-bench task.

Loads a SWE-bench dataset (SWE-bench Lite by default), looks up one task
instance by its ID (e.g. "django__django-10924"), prints its key fields, and
writes the problem statement, the hints and the full record to files in the
current working directory.

NOTE(review): the ``%env.HF_DATASETS_CACHE%`` and ``%instance_id%`` tokens
look like template placeholders that are substituted before the script is
run -- confirm against the pipeline that launches this script.
"""

import json
from typing import Optional

from datasets import load_dataset

# Fields printed in the header section of display_problem_info(); every other
# field is listed under "ADDITIONAL FIELDS".
_HEADER_FIELDS = (
    'instance_id',
    'repo',
    'base_commit',
    'version',
    'problem_statement',
)

# String values longer than this are shortened in the "ADDITIONAL FIELDS" list.
_MAX_FIELD_PREVIEW = 200


def get_problem_statement(
    instance_id: str,
    dataset_name: str = "princeton-nlp/SWE-bench_Lite",
) -> Optional[dict]:
    """
    Get the task instance with the given ID from a SWEBench dataset.

    Args:
        instance_id (str): Task instance ID (e.g., "django__django-10924")
        dataset_name (str): Name of the SWEBench dataset to load

    Returns:
        dict: Task instance data including problem statement, or None when
        no instance in the 'test' split has that ID.
    """
    print(f"Loading dataset: {dataset_name}")

    # Load the dataset
    dataset = load_dataset(dataset_name, cache_dir="%env.HF_DATASETS_CACHE%")

    # SWEBench Lite has 'test' split
    test_data = dataset['test']

    print(f"Dataset loaded with {len(test_data)} instances")

    # Find the specific instance (linear scan; the first match wins)
    for instance in test_data:
        if instance['instance_id'] == instance_id:
            return instance

    print(f"Instance ID '{instance_id}' not found in dataset")
    return None


def display_problem_info(instance_data):
    """
    Display key information about the problem instance.

    Prints the identifying fields, then the full problem statement, then
    every remaining field (long string values are cut to a short preview).

    Args:
        instance_data (dict): Task instance data; a falsy value prints a
            notice and returns without displaying anything.
    """
    if not instance_data:
        print("No instance data to display")
        return

    print("\n" + "="*80)
    print("PROBLEM INSTANCE INFORMATION")
    print("="*80)

    print(f"Instance ID: {instance_data['instance_id']}")
    print(f"Repository: {instance_data['repo']}")
    print(f"Base Commit: {instance_data['base_commit']}")
    print(f"Version: {instance_data['version']}")

    print("\n" + "-"*80)
    print("PROBLEM STATEMENT:")
    print("-"*80)
    print(instance_data['problem_statement'])

    print("\n" + "-"*80)
    print("ADDITIONAL FIELDS:")
    print("-"*80)
    for key, value in instance_data.items():
        if key in _HEADER_FIELDS:
            # Already shown above.
            continue
        if isinstance(value, str) and len(value) > _MAX_FIELD_PREVIEW:
            print(f"{key}: {value[:_MAX_FIELD_PREVIEW]}... (truncated)")
        else:
            print(f"{key}: {value}")


def _save_text_field(text, output_file, label):
    """Write *text* to *output_file* and report it; do nothing if *text* is empty or None."""
    if not text:
        return
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(text)
    print(f"\n{label} saved to: {output_file}")


def main():
    """Fetch the target instance, display it, and save it to files.

    Writes up to three files named after the instance ID into the current
    working directory: ``<id>_issue.md`` (problem statement),
    ``<id>_hints.md`` (hints) and ``<id>.json`` (the full record). The two
    Markdown files are skipped when the corresponding field is empty or
    absent. Nothing is written when the instance is not found.
    """
    # Target instance ID from SWEBench Lite
    target_instance = "%instance_id%"

    instance_data = get_problem_statement(target_instance)
    if not instance_data:
        return

    display_problem_info(instance_data)

    # Save problem statement to Markdown file
    _save_text_field(
        instance_data.get('problem_statement'),
        f"{target_instance}_issue.md",
        "Instance problem statement",
    )

    # Save hints to Markdown file; .get() tolerates records without the field
    _save_text_field(
        instance_data.get('hints_text'),
        f"{target_instance}_hints.md",
        "Instance hints",
    )

    # Save the complete record as JSON
    output_file = f"{target_instance}.json"
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(instance_data, f, indent=2, ensure_ascii=False)
    print(f"\nFull instance data saved to: {output_file}")


if __name__ == "__main__":
    main()