# question_generation_model.py
def dissimilarity_grouping(self, temp, num_groups, beam_size):
"""
This function is responsible for clustering hypothesis at each step to produce diverse solutions
:param temp: Candidate solutions
:param num_groups: Number of clusters
:param beam_size: Beam size to be used for decoding
:return:
"""
    all_sentences = []
    for s, prob in temp:
        intermediate_question = [self.datasets.idx_to_word[i] for i in s]
        # Drop the leading token (the start symbol) before joining into a sentence.
        all_sentences.append(' '.join(intermediate_question[1:]))
    self.logger.debug('Sentences being clustered: %s', all_sentences)
    # De-duplicate the sentences while remembering the first position at which each
    # one appeared, so cluster members can later be mapped back to entries of temp.
    dict_map_sentence_to_idx = dict()
    for i, s in enumerate(all_sentences):
        if s not in dict_map_sentence_to_idx:
            dict_map_sentence_to_idx[s] = i
    uniq_sentences = list(dict_map_sentence_to_idx.keys())
    try:
        retained = dict(cluster_texts(uniq_sentences, num_groups))
    except Exception:
        self.logger.error('Error with clustering in diversity beam search')
        # Fall back to the unfiltered candidate list if clustering fails.
        return temp
    # Always keep the two best candidates; temp is assumed to be sorted with the
    # highest-scoring hypotheses last, so these are its final two entries.
    picked = [len(all_sentences) - 2, len(all_sentences) - 1]
    retained_sentences = [temp[idx] for idx in picked]
    while len(retained_sentences) < beam_size:
        # Candidate positions in temp that have not been retained yet.
        available = {dict_map_sentence_to_idx[uniq_sentences[i]]
                     for values in retained.values() for i in values} - set(picked)
        if not available:
            # Every clustered candidate is already retained; stop rather than loop forever.
            break
        for values in retained.values():
            # Pick one candidate at random from each cluster, mapping its position in
            # uniq_sentences (which cluster_texts is assumed to return) back to temp.
            idx = dict_map_sentence_to_idx[uniq_sentences[random.choice(values)]]
            if idx not in picked:
                picked.append(idx)
                retained_sentences.append(temp[idx])
                if len(retained_sentences) >= beam_size:
                    break
    # Log the hypotheses that survive for the next decoding step.
    self.logger.debug('Filtered sentences')
    for s in retained_sentences:
        iq = ' '.join([self.datasets.idx_to_word[i] for i in s[0]])
        self.logger.debug(iq)
    return retained_sentences
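

# The grouping above only relies on cluster_texts returning pairs of
# (cluster label, positions within the list it was given) that dict() can consume.
# The helper below is a minimal sketch of such a function, for illustration only:
# the project's real cluster_texts lives elsewhere, and the TF-IDF/k-means choice
# and the scikit-learn dependency here are assumptions, not this repository's code.
def _cluster_texts_sketch(texts, num_clusters):
    from collections import defaultdict

    from sklearn.cluster import KMeans
    from sklearn.feature_extraction.text import TfidfVectorizer

    # Vectorise each candidate sentence with TF-IDF and cluster the vectors.
    vectors = TfidfVectorizer().fit_transform(texts)
    labels = KMeans(n_clusters=num_clusters, n_init=10).fit_predict(vectors)

    # Group input positions by cluster label, matching the (label, indices)
    # shape that dict() consumes in dissimilarity_grouping.
    groups = defaultdict(list)
    for position, label in enumerate(labels):
        groups[label].append(position)
    return list(groups.items())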