in src/graph_construction.py [0:0]
def construct_graph(docs, coref_predictor, oie_predictor):
    """Build an undirected entity graph from OpenIE triples over *docs*.

    Pipeline: run coreference resolution per document, compute tf-idf
    features over the coref-tokenized corpus, then for each OpenIE triple
    (ARG0, V, ARG1) add/merge subject and object nodes (merging via
    ``similar_match`` on name + tf-idf vector) and attach the predicate to
    the connecting edge.

    Args:
        docs: iterable of raw document strings.
        coref_predictor: model handle passed through to ``extract_coref``.
        oie_predictor: model handle passed through to ``extract_oie``.

    Returns:
        (graph_info, extra_info) where graph_info holds the ``nx.Graph``
        and the ``directed_edges`` map recording original triple direction,
        and extra_info holds the intermediate coref/OIE/tf-idf artifacts.
    """
    graph = nx.Graph()
    node_names_list = []  # canonical text of each node, indexed by node id
    node_vec_list = []    # tf-idf vector of each node, indexed by node id
    node_counter = 0

    # --- Coreference resolution over all documents ---
    tokenized_docs = []
    coref_info_list = []
    for d in docs:
        dtokens, cinfo = extract_coref(coref_predictor, d)
        tokenized_docs.append(" ".join(dtokens))
        coref_info_list.append(cinfo)

    tfidfs_list, word_index_mapping, total_num_words = compute_base_features(tokenized_docs)

    # Fallback tf-idf table: per word, the max score seen across all docs.
    # Used when a resolved span contains a word absent from the current
    # document's own tf-idf dict (e.g. a coref "unique" name from another doc).
    generic_tfidfs = {}
    for tfidf in tfidfs_list:
        for k, v in tfidf.items():
            if k not in generic_tfidfs or v > generic_tfidfs[k]:
                generic_tfidfs[k] = v

    def _resolve_node(arg, coref_info, tfidfs):
        """Map one OIE argument to a node index.

        Prefers the coref cluster's unique name over the raw span, builds
        the span's tf-idf vector, then either merges into a similar
        existing node (bumping its weight) or creates a fresh node.
        """
        nonlocal node_counter
        uid = "{}-{}".format(arg["index"][0], arg["index"][1])
        if coref_info.get(uid) is not None:
            name = coref_info[uid]["unique"]
        else:
            name = arg["span"]
        vec = np.zeros(total_num_words)
        for word in name.split():
            try:
                vec[word_index_mapping[word]] = (
                    tfidfs[word] if tfidfs.get(word) is not None else generic_tfidfs[word]
                )
            except KeyError:
                # Word missing from the vocabulary / both tf-idf tables:
                # leave its vector entry at 0 (best-effort, as before).
                print(f"Exception Occurred for word: {word}")
        sim_node_ind = similar_match(name, node_names_list, vec, node_vec_list)
        if sim_node_ind == -1:
            graph.add_node(node_counter, text=name, weight=1)
            node_index = node_counter
            node_counter += 1
            node_names_list.append(name)
            node_vec_list.append(vec)
        else:
            node_index = sim_node_ind
            graph.nodes[node_index]["weight"] += 1
        return node_index

    oies_list = []
    directed_edges = {}  # "sub-obj" keys recording the original triple direction
    for ind, d in enumerate(docs):
        # Get OIE triples for this document.
        oies = extract_oie(oie_predictor, d)
        coref_info = coref_info_list[ind]
        tfidfs = tfidfs_list[ind]
        oies_list.append(oies)
        for triple in oies:
            sub_index = _resolve_node(triple["ARG0"], coref_info, tfidfs)
            obj_index = _resolve_node(triple["ARG1"], coref_info, tfidfs)
            # Edge info: attach the predicate, merging with a similar
            # existing predicate on the same edge if one exists.
            pred_name = triple["V"]["span"]
            if graph.has_edge(sub_index, obj_index):
                existing_preds = [p["text"] for p in graph[sub_index][obj_index]["preds"]]
                sim_ind = similar_match(pred_name, existing_preds)  # NOTE: no vector matching
                if sim_ind == -1:
                    graph[sub_index][obj_index]["preds"].append({"text": pred_name, "weight": 1})
                else:
                    graph[sub_index][obj_index]["preds"][sim_ind]["weight"] += 1
            else:
                graph.add_edge(sub_index, obj_index, preds=[{"text": pred_name, "weight": 1}])
                # NOTE(review): direction is recorded only when the edge is
                # first created; a later triple in the reverse direction is
                # not recorded — confirm this matches intended semantics.
                directed_edges[f"{sub_index}-{obj_index}"] = 1

    extra_info = {}
    extra_info['coref_info'] = coref_info_list
    extra_info['oie_info'] = oies_list
    extra_info['tfidfs'] = tfidfs_list
    extra_info['generic_tfidfs'] = generic_tfidfs
    extra_info['word_index_mapping'] = word_index_mapping
    extra_info['total_num_words'] = total_num_words
    graph_info = {'graph': graph, 'directed_edges': directed_edges}
    return graph_info, extra_info