def get_problem_statement()

in scripts/extract_task_data.py [0:0]


def get_problem_statement(
    instance_id: str,
    dataset_name: str = "princeton-nlp/SWE-bench_Lite",
    split: str = "test",
):
    """
    Get the problem statement for a specific task instance from SWEBench dataset.

    Args:
        instance_id (str): Task instance ID (e.g., "django__django-10924")
        dataset_name (str): Name of the SWEBench dataset to load
        split (str): Name of the dataset split to search. Defaults to
            "test", the split used for SWEBench Lite.

    Returns:
        The first record in the chosen split whose 'instance_id' field
        equals ``instance_id`` (task instance data including the problem
        statement), or None if no record matches. A message is printed in
        the not-found case.
    """
    # Function-scope import so this block stays self-contained regardless
    # of what the file's top-level import block already provides.
    import os

    print(f"Loading dataset: {dataset_name}")

    # Resolve the cache directory from the real environment at call time.
    # A "%env.NAME%"-style placeholder is never expanded by Python and would
    # be used as a literal relative directory name. An unset or empty
    # variable yields None, which lets `load_dataset` pick its default cache.
    cache_dir = os.environ.get("HF_DATASETS_CACHE") or None

    # Load the dataset
    dataset = load_dataset(dataset_name, cache_dir=cache_dir)

    # SWEBench Lite has 'test' split
    split_data = dataset[split]

    print(f"Dataset loaded with {len(split_data)} instances")

    # Find the specific instance (linear scan; first match wins)
    for instance in split_data:
        if instance["instance_id"] == instance_id:
            return instance

    print(f"Instance ID '{instance_id}' not found in dataset")
    return None