in data_preparation/metadata_completion/DuplicateSearch.py [0:0]
def getPossibleMatches(allGroups):
    """Find sets of entries that look like duplicates of one another.

    Args:
        allGroups: iterable of lists. Every list holds 3-item entries
            ``(title, tags, metadataName)`` where ``tags`` is a mapping
            supporting ``tags[key]`` and ``tags.get(key, default)``.
            Each list is sorted IN PLACE by title (stable sort).

    Returns:
        A list of lists of metadata names. Each inner list contains the
        names of all entries judged to duplicate a reference entry,
        followed by the reference entry's own name as the last item.
        Reference entries without any duplicate produce no inner list.

    Two entries are possible duplicates when they have the same title and
    every tag of the reference entry (except "version", "abridged" and
    "dramatic reading") has the same value in the other entry.
    NOTE(review): the comparison is one-directional -- extra tags carried
    only by the other entry are not checked. Kept as in the original
    implementation; confirm this is intended.
    """
    # Tags that may differ between two recordings of the same work.
    ignoredTags = ("version", "abridged", "dramatic reading")

    def _tagsCompatible(refTags, otherTags):
        # True when no non-ignored tag of refTags differs in otherTags.
        return not any(
            otherTags.get(tag, None) != refTags[tag]
            for tag in refTags
            if tag not in ignoredTags
        )

    output = []
    for group in allGroups:
        # Sorting makes entries with identical titles contiguous.
        group.sort(key=lambda x: x[0])
        groupSize = len(group)
        runStart = 0
        while runStart < groupSize:
            # Delimit the run [runStart, runEnd) of entries sharing a title.
            runTitle = group[runStart][0]
            runEnd = runStart + 1
            while runEnd < groupSize and runTitle == group[runEnd][0]:
                runEnd += 1

            # Use the first pending entry as the reference; entries that do
            # not match it are retried against the next reference instead
            # of being dropped, so duplicates among them are still found.
            pending = group[runStart:runEnd]
            while len(pending) > 1:
                _, refTags, refMetadataName = pending[0]
                currMatch = []
                leftover = []
                for entry in pending[1:]:
                    _, entryTags, entryMetadataName = entry
                    if _tagsCompatible(refTags, entryTags):
                        currMatch.append(entryMetadataName)
                    else:
                        leftover.append(entry)
                if currMatch:
                    currMatch.append(refMetadataName)
                    output.append(currMatch)
                pending = leftover

            runStart = runEnd
    return output