def get_recursive_tree()

in microservices/course_ingestion/services/clustering/hierarchical_clustering.py [0:0]


# Settings for every clustered level handled by get_recursive_tree, keyed by
# the node_level argument.
#   child_level:    level name passed to get_optimum_clusters; also the key
#                   that stores the cluster label in each child node and the
#                   node_level used when recursing into that child.
#   children_key:   key under which the child's own sub-tree is stored.
#   paragraph_join: True  -> node text is "<p>".join(docs)
#                   False -> node text is join_texts(docs)
#   blooms_title:   value recorded in the text_list entry for the node.
#   with_topics:    True -> a "topics" entry is added via get_topics.
#   descend_flag:   None -> always recurse and forward create_learning_units;
#                   otherwise the name of the flag that gates the recursion
#                   (the recursive call then passes False for
#                   create_learning_units).
_TREE_LEVEL_SPECS = {
    "course": {
        "child_level": "competency",
        "children_key": "sub_competencies",
        "paragraph_join": False,
        "blooms_title": False,
        "with_topics": False,
        "descend_flag": None,
    },
    "competency": {
        "child_level": "sub_competency",
        "children_key": "learning_objectives",
        "paragraph_join": False,
        "blooms_title": False,
        "with_topics": False,
        "descend_flag": None,
    },
    "sub_competency": {
        "child_level": "learning_objective",
        "children_key": "learning_units",
        "paragraph_join": True,
        "blooms_title": True,
        "with_topics": False,
        "descend_flag": "create_learning_units",
    },
    "learning_objective": {
        "child_level": "learning_unit",
        "children_key": "triples",
        "paragraph_join": True,
        "blooms_title": True,
        "with_topics": True,
        "descend_flag": "create_triples",
    },
}


def _group_doc_ids_by_cluster(doc_ids, clusters):
  """Maps each cluster label to the doc ids assigned to it, in input order."""
  members = {}
  for doc_id, cluster_id in zip(doc_ids, clusters):
    members.setdefault(cluster_id, []).append(doc_id)
  return members


def get_recursive_tree(clustering_model, embeddings, node_level, documents,
                       doc_ids, text_list, create_learning_units,
                       create_triples):
  """Recursively clusters documents into a nested tree of nodes.

  Starting from node_level, the documents selected by doc_ids are clustered
  with get_optimum_clusters and one dict is built per distinct cluster label.
  The levels are chained as:
  course -> competency -> sub_competency -> learning_objective ->
  learning_unit. At "learning_unit" no clustering happens; the documents are
  joined with a space and handed to triple_service.generate_triples.

  Every node dict holds the cluster label (under the child level's name),
  "document_ids", "text", "title" and, when the recursion goes deeper, the
  child sub-tree. "title" is the index in text_list of the entry appended
  for that node. Learning-unit nodes additionally get "topics".

  Args:
    clustering_model: passed unchanged to get_optimum_clusters.
    embeddings: indexed as embeddings[doc_ids] (assumes an array type that
      supports indexing with a list of ids, e.g. numpy - TODO confirm).
    node_level: one of "course", "competency", "sub_competency",
      "learning_objective" or "learning_unit".
    documents: container of document texts indexable by doc id.
    doc_ids: ids of the documents that belong to the current node.
    text_list: list that receives one {"docs", "blooms_title"} entry per
      created node; it is appended to in place.
    create_learning_units: when truthy, learning objectives are split
      further into learning units.
    create_triples: when truthy, triples are generated for learning units.

  Returns:
    A tuple (nodes, text_list). For "learning_unit", nodes is the first
    element of the result of triple_service.generate_triples; otherwise it
    is the list of node dicts built for this level.

  Raises:
    ValueError: if node_level is not one of the supported levels.
  """
  if node_level == "learning_unit":
    lu_text_list = [" ".join([documents[i] for i in doc_ids])]
    return triple_service.generate_triples(lu_text_list)[0], text_list

  spec = _TREE_LEVEL_SPECS.get(node_level)
  if spec is None:
    # Fail loudly instead of implicitly returning None, which callers would
    # only notice as an unpacking error.
    raise ValueError("Unsupported node_level: {!r}".format(node_level))

  child_level = spec["child_level"]
  descend_flag = spec["descend_flag"]
  if descend_flag is None:
    descend = True
    child_create_learning_units = create_learning_units
  else:
    if descend_flag == "create_learning_units":
      descend = create_learning_units
    else:
      descend = create_triples
    child_create_learning_units = False

  clusters = get_optimum_clusters(
      clustering_model, embeddings[doc_ids], level=child_level)
  # Group once rather than rescanning every (doc_id, cluster) pair per label.
  members = _group_doc_ids_by_cluster(doc_ids, clusters)

  nodes = []
  # Iterate over set(clusters) to keep the original node ordering.
  for cluster_id in set(clusters):
    node = {}
    node[child_level] = cluster_id
    node["document_ids"] = members.get(cluster_id, [])
    docs = [documents[i] for i in node["document_ids"]]
    if spec["paragraph_join"]:
      node["text"] = "<p>".join(docs)
    else:
      node["text"] = join_texts(docs)
    if spec["with_topics"]:
      node["topics"] = get_topics(node["text"].replace("<p>", " "))
    # The title is a placeholder: the position of this node's entry in
    # text_list.
    node["title"] = len(text_list)
    text_list.append({
        "docs": docs,
        "blooms_title": spec["blooms_title"]
    })
    if descend:
      node[spec["children_key"]], text_list = get_recursive_tree(
          clustering_model, embeddings, child_level, documents,
          node["document_ids"], text_list, child_create_learning_units,
          create_triples)
    nodes.append(node)
  return nodes, text_list