def breakdown_datasets()

in evalbench/dataset/dataset.py [0:0]


def breakdown_datasets(
    total_dataset: list[EvalInputRequest],
) -> tuple[dict[str, dict[str, dict[str, list[EvalInputRequest]]]], int, int]:
    """
    Group evaluation requests by dialect, then database, then query type.

    Each request is fanned out once per entry in its ``dialects``; the item
    stored in the result is ``request.copy_for_dialect(dialect)``, not the
    original request object.

    The grouped mapping has the shape
    dict[str, dict[str, dict[str, list[EvalInputRequest]]]]
    in the following format:
    {
      dialect (str):
      -> database (str):
          -> query_type (str; e.g. dql, dml, ddl -- not validated here):
              -> list[EvalInputRequest]
    }

    Returns a 3-tuple ``(datasets, total_dataset_len, total_db_len)``:
      * ``datasets``: the nested mapping described above.
      * ``total_dataset_len``: number of per-dialect copies stored, i.e. the
        sum of ``len(request.dialects)`` over all requests.
      * ``total_db_len``: number of distinct (dialect, database, query_type)
        groups created. Despite the name, this counts leaf groups rather
        than databases.

    A request whose ``dialects`` is empty contributes nothing.
    """
    total_dataset_len = 0
    total_db_len = 0
    datasets: dict[str, dict[str, dict[str, list[EvalInputRequest]]]] = {}
    for request in total_dataset:
        for dialect in request.dialects:
            by_database = datasets.setdefault(dialect, {})
            by_query_type = by_database.setdefault(request.database, {})
            group = by_query_type.get(request.query_type)
            if group is None:
                # First request seen for this (dialect, database, query_type):
                # open a new leaf group and count it.
                group = by_query_type[request.query_type] = []
                total_db_len += 1
            group.append(request.copy_for_dialect(dialect))
            total_dataset_len += 1
    return datasets, total_dataset_len, total_db_len