in jupyter/comparison-to-datasketch/cardinality_error_experiment.py [0:0]
def run(self):
    """Run all trials and periodically save the relative-error results.

    For each of ``self.num_trials`` trials, a fresh ``ds.hll_sketch`` and a
    fresh ``d.HyperLogLogPlusPlus`` sketch (both sized by
    ``self.sketch_lgk``) are fed the same stream produced by
    ``distinct_number_sequence``.  Whenever the number of updates reaches
    the next value in ``self.plot_points``, the relative error
    ``(estimate - n) / n`` of each sketch is recorded, where ``n`` is the
    update count.  This treats the update count as the true cardinality,
    i.e. it assumes every item of the stream is distinct -- TODO confirm
    against ``distinct_number_sequence``.

    After every trial whose 1-based number is a power of two greater than
    one, the results gathered so far are:

    * stored in ``self.DataSketches_results_df`` and
      ``self.datasketch_results_df`` (one row per plot point, one column
      per trial, reusing the index of the previous frames),
    * written as CSV files under ``self.directory_name``, and
    * printed to stdout.

    Returns:
        None.  Results are exposed through the two data frame attributes
        and the CSV files.

    Raises:
        IndexError: if ``self.plot_points`` is empty.
    """
    seq_start = np.uint64(2345234)
    distinct_number = np.uint64(3462)
    num_plot_points = len(self.plot_points)

    # One row per trial, one column per plot point.
    ds_all_results = np.zeros((self.num_trials, num_plot_points))
    d_all_results = np.zeros_like(ds_all_results)

    def first_hash_word(data):
        # mmh3.hash64 returns a pair of integers; only the first is used.
        return mmh3.hash64(data, signed=False)[0]

    # Trials are counted from 1 so the power-of-two test below applies to
    # the number of completed trials.
    for trial in range(1, self.num_trials + 1):
        # Initialise the sketches
        hll = ds.hll_sketch(self.sketch_lgk, ds.HLL_8)
        h = d.HyperLogLogPlusPlus(p=self.sketch_lgk, hashfunc=first_hash_word)

        seq_start += distinct_number  # Start a new input sequence

        # Row views: writing through them fills the result matrices
        # directly (row index is trial - 1 because trials are 1-based).
        ds_row = ds_all_results[trial - 1]
        d_row = d_all_results[trial - 1]

        # Return to the start of the plot points list for this trial.
        plot_point_index = 0
        plot_point_value = self.plot_points[plot_point_index]
        total_updates = 0

        for new_number in distinct_number_sequence(seq_start):
            hll.update(new_number)
            h.update(str(new_number).encode('utf8'))
            total_updates += 1
            if total_updates != plot_point_value:
                continue

            # A plot point has been reached: record both relative errors.
            ds_row[plot_point_index] = (
                (hll.get_estimate() - plot_point_value) / plot_point_value
            )
            d_row[plot_point_index] = (
                (h.count() - plot_point_value) / plot_point_value
            )
            plot_point_index += 1
            if plot_point_index >= num_plot_points:
                # Every plot point has been measured; stop consuming input.
                break
            plot_point_value = self.plot_points[plot_point_index]

        # Write the arrays only a logarithmic number of times.
        if not (self._is_power_of_two(trial) and trial > 1):
            continue

        trial_columns = np.arange(trial).tolist()
        file_suffix = self.file_extension + f"trials_{trial}.csv"

        print(f"#################### PARTIAL RESULTS FOR {trial} TRIALS: DATASKETCHES ####################")
        # Transpose so that rows are plot points and columns are trials.
        self.DataSketches_results_df = pd.DataFrame(
            ds_all_results[0:trial, :].T,
            index=self.DataSketches_results_df.index,
            columns=trial_columns,
        )
        self.DataSketches_results_df.to_csv(
            self.directory_name + "/DataSketches_hll" + file_suffix,
            index_label="n",
        )
        self.datasketch_results_df = pd.DataFrame(
            d_all_results[0:trial, :].T,
            index=self.datasketch_results_df.index,
            columns=trial_columns,
        )
        self.datasketch_results_df.to_csv(
            self.directory_name + "/datasketch_hll" + file_suffix,
            index_label="n",
        )
        print(self.DataSketches_results_df)
        print(f"#################### PARTIAL RESULTS FOR {trial} TRIALS: datasketch ####################")
        print(self.datasketch_results_df)