def extract_triples_by_regex_with_schema()

in hugegraph-llm/src/hugegraph_llm/operators/llm_op/info_extract.py [0:0]


def extract_triples_by_regex_with_schema(schema, text, graph):
    """Parse ``(s, p, o) - label`` records from *text* and merge them into *graph*.

    Every record whose ``label`` equals a ``vertex_label`` in
    ``schema["vertices"]`` and whose ``p`` matches (case-insensitively) one of
    that vertex's ``properties`` sets property ``p = o`` on the vertex
    ``"<label>-<s>"``, creating the vertex if needed.  Every record whose
    ``label`` equals an ``edge_label`` in ``schema["edges"]`` appends an edge
    from ``"<source_vertex_label>-<s>"`` to ``"<target_vertex_label>-<o>"``,
    creating bare endpoint vertices if they do not exist yet.

    Args:
        schema: Mapping with ``"vertices"`` (items carrying ``vertex_label``
            and ``properties``) and ``"edges"`` (items carrying
            ``edge_label``, ``source_vertex_label``, ``target_vertex_label``).
        text: Raw text to scan for triples.
        graph: Mapping with ``"vertices"`` (iterable of dicts keyed by
            ``"id"``) and ``"edges"`` (a list).  Mutated in place.

    Returns:
        None. ``graph["edges"]`` is extended and ``graph["vertices"]`` is
        replaced by a list of the merged vertices.
    """
    # Flatten both escaped ("\\n") and real newlines, and drop stray
    # backslashes, so each record sits on one logical line for the regex.
    text = text.replace("\\n", " ").replace("\\", " ").replace("\n", " ")
    pattern = r"\((.*?), (.*?), (.*?)\) - ([^ ]*)"
    matches = re.findall(pattern, text)

    # Index existing vertices by id so repeated mentions merge instead of
    # producing duplicates.
    vertices_dict = {v["id"]: v for v in graph["vertices"]}
    for match in matches:
        s, p, o, label = [item.strip() for item in match]
        # re.findall never yields None for a group; a missing field shows up
        # as an empty string, so that is what must be filtered out here.
        if not (s and p and o and label):
            continue
        # TODO: use a more efficient way to compare the extract & input property
        p_lower = p.lower()
        for vertex in schema["vertices"]:
            if vertex["vertex_label"] != label:
                continue
            if not any(pp.lower() == p_lower for pp in vertex["properties"]):
                continue
            vertex_id = f"{label}-{s}"
            if vertex_id not in vertices_dict:
                vertices_dict[vertex_id] = {"id": vertex_id, "name": s, "label": label,
                                            "properties": {p: o}}
            else:
                vertices_dict[vertex_id]["properties"][p] = o
            break
        for edge in schema["edges"]:
            if edge["edge_label"] != label:
                continue
            source_label = edge["source_vertex_label"]
            source_id = f"{source_label}-{s}"
            if source_id not in vertices_dict:
                vertices_dict[source_id] = {"id": source_id, "name": s, "label": source_label,
                                            "properties": {}}
            target_label = edge["target_vertex_label"]
            target_id = f"{target_label}-{o}"
            if target_id not in vertices_dict:
                vertices_dict[target_id] = {"id": target_id, "name": o, "label": target_label,
                                            "properties": {}}
            graph["edges"].append({"start": source_id, "end": target_id, "type": label,
                                   "properties": {}})
            break
    # Materialize as a list: a dict_values view is not JSON-serializable,
    # cannot be indexed or appended to, and stays tied to the temporary dict.
    graph["vertices"] = list(vertices_dict.values())