in scripts/extract_task_data.py [0:0]
def get_problem_statement(instance_id: str, dataset_name: str = "princeton-nlp/SWE-bench_Lite"):
    """
    Get the problem statement for a specific task instance from SWEBench dataset.

    The dataset's 'test' split is scanned in order and the first record whose
    'instance_id' field equals ``instance_id`` is returned.

    Args:
        instance_id (str): Task instance ID (e.g., "django__django-10924")
        dataset_name (str): Name of the SWEBench dataset to load

    Returns:
        The matching task instance record (expected to include the problem
        statement), or None if no record in the 'test' split has that ID.
        A message is printed in the not-found case.
    """
    # Function-scope import so this block does not depend on the file's
    # top-level import list.
    import os

    print(f"Loading dataset: {dataset_name}")

    # NOTE(review): "%env.HF_DATASETS_CACHE%" looks like a CI/templating
    # placeholder that is meant to be substituted before the script runs --
    # confirm. The literal is kept so such substitution keeps working; if it
    # is still unexpanded at runtime, fall back to the HF_DATASETS_CACHE
    # environment variable (None lets the library choose its default cache)
    # instead of caching into a directory literally named after the
    # placeholder.
    cache_dir = "%env.HF_DATASETS_CACHE%"
    if cache_dir.startswith("%") and cache_dir.endswith("%"):
        cache_dir = os.environ.get("HF_DATASETS_CACHE")

    # Load the dataset
    dataset = load_dataset(dataset_name, cache_dir=cache_dir)

    # SWEBench Lite has 'test' split
    test_data = dataset['test']
    print(f"Dataset loaded with {len(test_data)} instances")

    # Find the specific instance (first match wins, same as a linear scan).
    match = next(
        (instance for instance in test_data if instance['instance_id'] == instance_id),
        None,
    )
    if match is None:
        print(f"Instance ID '{instance_id}' not found in dataset")
    return match