in src/text_clustering.py [0:0]
def summarize(self, texts, labels):
    """Build one summary per cluster by prompting the summary model.

    For every cluster label other than ``-1``, a handful of member
    documents are sampled, truncated to ``self.summary_chunk_size``,
    formatted into ``self.summary_template`` and sent to the inference
    client. The raw response is passed through
    ``self._postprocess_response``.

    Args:
        texts: Collection of documents, indexed by the ids stored in
            ``self.label2docs`` (each document must support slicing).
        labels: Iterable of cluster labels, one per document. The label
            ``-1`` is excluded from summarization.

    Returns:
        dict: Maps each cluster label to its post-processed summary. The
        ``-1`` label always maps to the string ``"None"``.
    """
    # Use the labels that actually occur instead of assuming that "-1" is
    # always present and that the remaining labels are exactly 0..k-1.
    # Casting to int keeps the result keys plain Python ints even when
    # `labels` is a NumPy array.
    cluster_labels = sorted({int(label) for label in labels} - {-1})
    cluster_summaries = {-1: "None"}

    if cluster_labels:
        # Only create the client when there is something to summarize.
        client = InferenceClient(self.summary_model, token=self.summary_model_token)
        for position, label in enumerate(cluster_labels):
            doc_ids = self.label2docs[label]
            # Sample without replacement so the prompt never repeats the
            # same document; small clusters simply contribute fewer examples.
            n_examples = min(self.summary_n_examples, len(doc_ids))
            ids = np.random.choice(doc_ids, n_examples, replace=False)
            examples = "\n\n".join(
                f"Example {i + 1}:\n{texts[_id][:self.summary_chunk_size]}"
                for i, _id in enumerate(ids)
            )
            request = self.summary_template.format(
                examples=examples, instruction=self.summary_instruction
            )
            response = client.text_generation(request)
            if position == 0:
                # Show one full prompt so the template can be eyeballed.
                print(f"Request:\n{request}")
            cluster_summaries[label] = self._postprocess_response(response)

    print(f"Number of clusters is {len(cluster_summaries)}")
    return cluster_summaries