in obelics/visualization/web_document_and_filtering_visualization.py [0:0]
def load_dataset(self):
    """Load the web documents dataset and truncate it to a user-chosen size.

    Renders a Streamlit header and a select box offering a fixed list of
    sizes, loads the dataset stored at ``self.path_web_documents_dataset``
    into ``self.full_dataset``, and keeps only its first N rows, where N is
    the selected size capped at the number of rows actually available.

    Also makes sure the ``retained_web_document_dataset`` and
    ``discarded_web_document_dataset`` entries exist in ``st.session_state``,
    creating them as ``None`` when absent; existing values are left untouched.
    """
    st.header("Select the size of the dataset")

    self.full_dataset = load_from_disk(self.path_web_documents_dataset)
    # Useful the first time we load the full dataset to add a column
    # indicating the original IDs of the documents
    # self.full_dataset = self.full_dataset.add_column("original_idx", [i for i in range(len(self.full_dataset))])
    # self.full_dataset.save_to_disk(self.path_web_documents_dataset)

    opt_sizes = ["100", "300", "1000", "3000", "10000"]
    size_dataset = st.selectbox(
        "Select the size of the dataset",
        options=opt_sizes,
    )
    # The select box hands back the chosen option itself, so there is no need
    # to scan the options for a match. The membership guard keeps the dataset
    # untruncated if the widget ever returns something outside the list.
    if size_dataset in opt_sizes:
        # Cap the request at the real row count so a small dataset does not
        # produce out-of-range indices.
        num_rows_to_keep = min(int(size_dataset), self.full_dataset.num_rows)
        self.full_dataset = self.full_dataset.select(range(num_rows_to_keep))

    # Create the session-state slots only when they are missing, so values
    # stored by a previous run of the script are preserved.
    for state_key in (
        "retained_web_document_dataset",
        "discarded_web_document_dataset",
    ):
        if state_key not in st.session_state:
            st.session_state[state_key] = None