in src/lighteval/tasks/extended/tiny_benchmarks/main.py [0:0]
def aggregate(self, y_input):
    """Compute the IRT-based performance estimates for ``self.task``.

    The scores in ``y_input`` are scattered into a full-length score vector
    at the positions listed in the pickled ``seen_examples``, an ability
    vector is fitted with ``fit_theta``, and three estimators are computed
    for every relevant scenario.

    The result of the last full computation is stored on the instance
    (``self.estimates`` / ``self.num_samples``). A later call whose
    ``y_input`` has the same length returns the stored value without
    inspecting the contents of ``y_input``.

    Args:
        y_input: Indexable sequence; ``y_input[i]`` is the score attached to
            the example at position ``seen_examples[i]``. Presumably
            per-example correctness scores -- confirm against the caller.

    Returns:
        The entry of the estimates dict for ``self.task``: a dict with the
        keys ``"irt"``, ``"pirt"`` and ``"gpirt"``.
    """
    # Fast path: reuse the estimates stored by a previous call.
    if len(y_input) == self.num_samples and self.estimates is not None:
        return self.estimates[self.task]

    # We load the weights for the relevant examples.
    # NOTE(review): the path is relative to the current working directory,
    # and pickle.load can execute arbitrary code -- only use a trusted file.
    with open("extended_tasks/tiny_benchmarks/tinyBenchmarks.pkl", "rb") as handle:
        tinyBenchmarks = pickle.load(handle)

    benchmark_data = tinyBenchmarks[self.scenario]
    seen_examples = benchmark_data["seen_examples"]
    examples_weights = benchmark_data["examples_weights"]
    irt_parameters = benchmark_data["irt_parameters"]
    A, B = irt_parameters["A"], irt_parameters["B"]
    optimal_lambdas = benchmark_data["optimal_lambdas"]
    scenarios_position = benchmark_data["scenarios_position"]
    subscenarios_position = benchmark_data["subscenarios_position"]

    # Total number of examples = largest stored position + 1.
    N = np.max([np.max(x) for x in scenarios_position.values()]) + 1

    # Per-example weights: each sub-scenario of a scenario gets the same
    # total weight, whatever its number of examples.
    balance_weights = np.ones(N)
    for scenario, positions in scenarios_position.items():
        N_sce = len(positions)
        sub_positions = subscenarios_position[scenario]
        n_sub = len(sub_positions)
        for sub_indices in sub_positions.values():
            balance_weights[sub_indices] = N_sce / (n_sub * len(sub_indices))

    # In case we use the big IRT model to estimate the performance of individual scenarios
    if self.task not in self.BENCHS:
        scenarios = [self.task]
        # seen_examples is laid out as consecutive chunks of
        # number_of_examples entries, one chunk per scenario in key order.
        ind_scenario = (
            self.number_of_examples * ([i for i, s in enumerate(scenarios_position) if s == self.task][0])
        )
        seen_examples = seen_examples[ind_scenario : ind_scenario + self.number_of_examples]
    else:
        scenarios = list(scenarios_position)

    # Creating vector y and estimating theta
    y = np.zeros(N)
    for i, j in enumerate(seen_examples):
        y[j] = y_input[i]

    # Getting estimates
    theta = fit_theta(y, seen_examples, A, B)

    # Set lookup keeps the membership tests below O(1) per index instead of
    # scanning seen_examples for each of the N candidates.
    seen_lookup = set(seen_examples)
    unseen_examples = [i for i in range(N) if i not in seen_lookup]

    # Neither quantity depends on the scenario, so compute them once.
    # NOTE(review): assumes item_curve has no side effects -- confirm.
    weighted_y = balance_weights * y
    weighted_curve = balance_weights * item_curve(theta.reshape(1, A.shape[1], 1), A, B)

    estimates = {}
    for scenario in scenarios:
        positions = scenarios_position[scenario]
        N_sce = len(positions)
        in_scenario = set(positions)
        seen_examples_sce = [s for s in seen_examples if s in in_scenario]
        unseen_examples_sce = [s for s in unseen_examples if s in in_scenario]

        # Observed part (seen examples) and model-predicted part (unseen ones).
        data_part_IRTp = weighted_y[seen_examples_sce].mean()
        irt_part = weighted_curve[0, [unseen_examples_sce]].mean()
        IRTp_lambd = self.number_of_examples / N_sce

        IRT = (examples_weights[scenario] * y[seen_examples_sce]).sum()
        IRTp = IRTp_lambd * data_part_IRTp + (1 - IRTp_lambd) * irt_part
        IRTpp = optimal_lambdas[scenario] * IRT + (1 - optimal_lambdas[scenario]) * IRTp

        estimates[scenario] = {"irt": IRT, "pirt": IRTp, "gpirt": IRTpp}

    # Remember the result so that the fast path above can serve later calls.
    self.num_samples = len(y_input)
    self.estimates = estimates
    return estimates[self.task]