in gpqa_eval.py [0:0]
def __init__( self, n_repeats: int = 4, variant: str = "diamond", num_examples: int | None = None, # restrict to a subset of the data for debugging