in src/text_clustering.py [0:0]
def load(self, folder):
    """Load clustering state from the files stored in ``folder``.

    Reads ``embeddings.npy``, ``faiss.index``, ``projections.npy``,
    ``cluster_labels.npy`` and ``texts.json``. ``cluster_summaries.json`` is
    optional and only read when the file is present. The lookup structures
    ``id2cluster``, ``label2docs`` and ``cluster_centers`` are then rebuilt
    from the loaded labels and projections.

    Args:
        folder: Path of the directory that contains the files listed above.

    Raises:
        ValueError: If ``folder`` does not exist.
    """
    if not os.path.exists(folder):
        raise ValueError(f"The folder '{folder}' does not exist.")

    self.embeddings = np.load(f"{folder}/embeddings.npy")
    self.faiss_index = faiss.read_index(f"{folder}/faiss.index")
    self.projections = np.load(f"{folder}/projections.npy")
    self.cluster_labels = np.load(f"{folder}/cluster_labels.npy")

    with open(f"{folder}/texts.json", "r") as f:
        self.texts = json.load(f)

    summaries_path = f"{folder}/cluster_summaries.json"
    if os.path.exists(summaries_path):
        with open(summaries_path, "r") as f:
            raw_summaries = json.load(f)
        # JSON object keys are always strings; convert them back to int so
        # they can be looked up with the integer cluster labels.
        self.cluster_summaries = {
            int(key): summary for key, summary in raw_summaries.items()
        }

    # those objects can be inferred and don't need to be saved/loaded
    self.id2cluster = dict(enumerate(self.cluster_labels))

    self.label2docs = defaultdict(list)
    for doc_index, label in enumerate(self.cluster_labels):
        self.label2docs[label].append(doc_index)

    # Centre of each cluster = mean of the first two projection columns over
    # the documents assigned to it. Indexing with the whole list of document
    # ids selects the same values as a per-document lookup, in one step.
    self.cluster_centers = {}
    for label, docs in self.label2docs.items():
        x = np.mean(self.projections[docs, 0])
        y = np.mean(self.projections[docs, 1])
        self.cluster_centers[label] = (x, y)