in microservices/course_ingestion/services/clustering/hierarchical_clustering.py [0:0]
# Maps a node level to (level of its children, key under which each child
# stores its own children). "learning_unit" is absent because it is the leaf
# level and is handled separately.
_CHILD_LEVELS = {
    "course": ("competency", "sub_competencies"),
    "competency": ("sub_competency", "learning_objectives"),
    "sub_competency": ("learning_objective", "learning_units"),
    "learning_objective": ("learning_unit", "triples"),
}


def get_recursive_tree(clustering_model, embeddings, node_level, documents,
                       doc_ids, text_list, create_learning_units,
                       create_triples):
    """Recursively cluster documents into a nested learning hierarchy.

    Starting at ``node_level``, the documents selected by ``doc_ids`` are
    clustered with ``get_optimum_clusters`` and one child node is built per
    distinct cluster label. Each child is then expanded one level further:
    course -> competency -> sub_competency -> learning_objective ->
    learning_unit -> triples.

    Args:
        clustering_model: Passed through unchanged to
            ``get_optimum_clusters``.
        embeddings: Indexed as ``embeddings[doc_ids]``; presumably an array
            supporting index-list selection -- TODO confirm with caller.
        node_level: One of ``"course"``, ``"competency"``,
            ``"sub_competency"``, ``"learning_objective"`` or
            ``"learning_unit"``.
        documents: Document texts, indexed by the values in ``doc_ids``.
        doc_ids: Indices of the documents that belong to the current node.
        text_list: List that receives one ``{"docs", "blooms_title"}`` entry
            per created node. It is appended to in place and also returned.
        create_learning_units: When truthy, learning objectives are expanded
            into ``"learning_units"``.
        create_triples: When truthy, learning units are expanded into
            ``"triples"``.

    Returns:
        A ``(children, text_list)`` tuple. For ``"learning_unit"`` the first
        element is the first result of ``triple_service.generate_triples``;
        for every other level it is a list of child-node dicts. Each node's
        ``"title"`` holds the index of that node's entry in ``text_list``.

    Raises:
        ValueError: If ``node_level`` is not one of the supported levels.
    """
    if node_level == "learning_unit":
        # Leaf level: no clustering, just extract triples from the joined
        # text of this unit's documents.
        lu_text_list = [" ".join([documents[i] for i in doc_ids])]
        triples = triple_service.generate_triples(lu_text_list)[0]
        return triples, text_list

    if node_level not in _CHILD_LEVELS:
        # Fail loudly instead of implicitly returning None, which callers
        # would only notice as an unpacking error.
        raise ValueError("Unknown node_level: %r" % (node_level,))

    child_level, children_key = _CHILD_LEVELS[node_level]

    # Learning objectives and learning units are the fine-grained levels:
    # their text is joined with "<p>" and their titles are flagged as
    # Bloom's titles; the coarser levels use join_texts instead.
    fine_grained = child_level in ("learning_objective", "learning_unit")

    # The two optional levels are gated by their flags; competencies and
    # sub-competencies are always expanded.
    if child_level == "learning_objective":
        expand_children = create_learning_units
    elif child_level == "learning_unit":
        expand_children = create_triples
    else:
        expand_children = True

    # Once below the sub-competency level, learning-unit creation is
    # switched off for deeper calls, exactly as the per-level code did.
    child_create_learning_units = (
        False if fine_grained else create_learning_units)

    clusters = get_optimum_clusters(
        clustering_model, embeddings[doc_ids], level=child_level)

    nodes = []
    for cluster in set(clusters):
        member_ids = [
            doc_id for doc_id, cluster_id in zip(doc_ids, clusters)
            if cluster_id == cluster
        ]
        member_docs = [documents[i] for i in member_ids]

        # Key order is kept stable: label, document_ids, text, [topics],
        # title, [children].
        node = {child_level: cluster, "document_ids": member_ids}
        if fine_grained:
            node["text"] = "<p>".join(member_docs)
        else:
            # join_texts gets its own copy so the list stored in text_list
            # below is independent of whatever the helper does with it.
            node["text"] = join_texts(list(member_docs))
        if child_level == "learning_unit":
            node["topics"] = get_topics(node["text"].replace("<p>", " "))

        # The title is the index of this node's entry in text_list.
        node["title"] = len(text_list)
        text_list.append({
            "docs": member_docs,
            "blooms_title": fine_grained
        })

        if expand_children:
            node[children_key], text_list = get_recursive_tree(
                clustering_model, embeddings, child_level, documents,
                member_ids, text_list, child_create_learning_units,
                create_triples)
        nodes.append(node)
    return nodes, text_list