in reference/src/main/python/similar.py [0:0]
def _best_extension(group, candidate_records, threshold1, threshold2):
    """Return the candidate index that best extends ``group``, or ``None``.

    Only indices greater than ``group[-1]`` are tried, so groups are always
    strictly increasing index lists.  An index ``k`` qualifies when:

    * the similarity score over slot ``[2]`` of every member plus ``k``,
      divided by the feature count of the first member's slot ``[2]``,
      is greater than ``threshold1``; and
    * the similarity score over slot ``[0]`` of every member plus ``k`` is
      greater than ``threshold2`` times the slot-``[2]`` score.

    Among qualifying indices the one with the highest slot-``[0]`` score
    wins; ties keep the lowest index, and a score of 0 never wins.
    """
    total = len(candidate_records)
    first_k = group[-1] + 1
    if first_k >= total:
        # Nothing left to try; do not touch the records at all.
        return None

    # These prefixes are identical for every k, so build them once.
    gate_prefix = [candidate_records[i][2] for i in group]
    rank_prefix = [candidate_records[i][0] for i in group]

    seed_size = sum(Counter(gate_prefix[0]["features"]).values())
    if seed_size == 0:
        # A seed without features cannot produce a meaningful ratio; treat
        # the group as non-extendable instead of raising ZeroDivisionError.
        return None

    best_k = None
    best_score = 0
    for k in range(first_k, total):
        # NOTE(review): assumes find_similarity_score_features_set_un is a
        # pure function of its argument, so one call replaces the two
        # identical calls the previous implementation made - confirm.
        gate_score = find_similarity_score_features_set_un(
            gate_prefix + [candidate_records[k][2]]
        )
        if gate_score / seed_size > threshold1:
            rank_score = find_similarity_score_features_set_un(
                rank_prefix + [candidate_records[k][0]]
            )
            if rank_score > threshold2 * gate_score and rank_score > best_score:
                best_k = k
                best_score = rank_score
    return best_k


def get_completions3(query_record, candidate_records, top_n, threshold1, threshold2):
    """Group similar candidates and return up to ``top_n`` pruned results.

    The function works in two phases:

    1. *Growth*: starting from every single candidate index, each group is
       repeatedly extended by the best later index (see ``_best_extension``)
       until no group can grow any further.  Every intermediate group of
       two or more indices is kept.
    2. *Selection*: groups are visited ordered by first index, longer groups
       first.  A group is skipped when ``distance`` between its index
       multiset and that of any earlier group in this order exceeds 0.5.
       For each kept group the slot-``[0]`` record of its first member is
       successively reduced with ``prune_last_jd`` against
       ``query_record`` and each remaining member.

    Args:
        query_record: Record passed through to ``prune_last_jd``.
        candidate_records: Indexable sequence of indexable entries.  Slot
            ``[0]`` and slot ``[2]`` of each entry are read; slot ``[2]``
            must support ``["features"]``.  Presumably slot ``[0]`` is the
            full record and slot ``[2]`` a pruned variant - confirm against
            the caller.
        top_n: Stop as soon as this many results have been collected.  The
            check happens after appending, so at least one result is
            returned whenever any group survives.
        threshold1: Lower bound (exclusive) for the slot-``[2]`` ratio.
        threshold2: Factor applied to the slot-``[2]`` score to obtain the
            lower bound (exclusive) for the slot-``[0]`` score.

    Returns:
        A list of lists, each ``[pruned_record, first_member_record,
        idx0, idx1, ...]``; empty when no group of two or more forms.
    """
    # ---- Growth phase -------------------------------------------------
    frontier = [[i] for i in range(len(candidate_records))]
    groups = []
    while frontier:
        extended = []
        for group in frontier:
            best_k = _best_extension(
                group, candidate_records, threshold1, threshold2
            )
            if best_k is not None:
                extended.append(group + [best_k])
        # Newest (longest) groups go to the front, as before.
        groups = extended + groups
        frontier = extended

    # ---- Selection phase ----------------------------------------------
    # Order by first index, then longest first.  A tuple key expresses this
    # directly and stays correct for any number of candidates, unlike the
    # former ``t[0] * 1000 - len(t)`` encoding.
    groups.sort(key=lambda g: (g[0], -len(g)))

    results = []
    for position, group in enumerate(groups):
        logging.info(f"Pruning {len(group)} {group}")
        members = Counter(group)
        # Compare against every earlier group, nearest first; stop at the
        # first one that is too close.
        overlaps_earlier = any(
            distance(members, Counter(groups[j])) > 0.5
            for j in reversed(range(position))
        )
        if overlaps_earlier:
            continue

        # The print duplicates the log line above; kept so that existing
        # stdout output does not change.
        print(f"Pruning {len(group)} {group}")
        logging.info("recommending")
        seed_record = candidate_records[group[0]][0]
        pruned_record = seed_record
        for member in group[1:]:
            pruned_record = prune_last_jd(
                [query_record, candidate_records[member][0]], pruned_record
            )
        results.append([pruned_record, seed_record] + group)
        if len(results) >= top_n:
            return results
    return results