in obelics/visualization/choose_filtering_parameters_web_documents_node_level.py [0:0]
def get_exs_and_stats(web_document_dataset, type_exs, funcs_compute_stats, text_node_level=True):
    """Gather the examples of one column and compute per-example statistics.

    Args:
        web_document_dataset: Dataset-like object exposing ``num_rows`` and
            integer row indexing; each row is then indexed by ``type_exs``.
        type_exs: Key of the column to read from every row. The value
            ``"texts"`` enables the text-specific handling described under
            ``text_node_level``; any other key collects the row's non-empty
            elements as they are.
        funcs_compute_stats: Mapping from a statistic name to a callable that
            takes one example and returns a number.
        text_node_level: Only consulted when ``type_exs == "texts"``. When
            true, every text is split on double newlines and each resulting
            paragraph becomes its own example. When false, the texts of a row
            are joined with double newlines into a single document-level
            example.

    Returns:
        A dict with the key ``"exs"`` holding the list of examples, plus one
        key per entry of ``funcs_compute_stats`` holding that statistic,
        rounded to 2 decimals, for each example in the same order.
    """
    is_text = type_exs == "texts"
    exs = []
    for idx_row in range(web_document_dataset.num_rows):
        # Fetch and filter the row's elements exactly once; the result is
        # shared by all three branches below.
        new_els = non_empty_els_from_list(web_document_dataset[idx_row][type_exs])
        if is_text and not text_node_level:
            # Text at document level: one example per row.
            exs.append("\n\n".join(new_els))
        elif is_text:
            # Text at paragraph level: one example per paragraph.
            exs.extend(paragraph for txt in new_els for paragraph in txt.split("\n\n"))
        else:
            exs.extend(new_els)

    # "exs" is stored first, so a statistic that is itself named "exs" would
    # replace it (same precedence as before).
    all_stats = {"exs": exs}
    for stat_name, func_compute_stats in funcs_compute_stats.items():
        all_stats[stat_name] = [round(func_compute_stats(ex), 2) for ex in exs]
    return all_stats