def getPossibleMatches()

in data_preparation/metadata_completion/DuplicateSearch.py [0:0]


def _tagsAgree(refTags, otherTags, ignoredTags):
    """Return True if every non-ignored tag of refTags has the same value in otherTags.

    The check is one-directional: tags that exist only in ``otherTags`` are
    not examined. A tag missing from ``otherTags`` is compared as ``None``.
    """
    for tag in refTags:
        if tag in ignoredTags:
            continue
        if otherTags.get(tag, None) != refTags[tag]:
            return False
    return True


def getPossibleMatches(allGroups):
    """Find entries that share a title and agree on their tags.

    Args:
        allGroups: iterable of lists. Each list holds
            ``(title, tags, metadataName)`` triples, where ``tags`` is a
            mapping (it is iterated by key and queried with ``.get``).
            NOTE: every list is sorted in place by title.

    Returns:
        A list of lists of metadata names. Each inner list is one cluster of
        possible duplicates: the names of the entries that matched a
        reference entry, followed by the reference entry's own name (last).
        Entries without any match are not reported.

    Two entries match when their titles are equal and every tag of the
    reference entry, except "version", "abridged" and "dramatic reading",
    has the same value in the other entry.
    """
    ignoredTags = ("version", "abridged", "dramatic reading")

    output = []
    for group in allGroups:
        # In-place sort (kept from the original behaviour) so that equal
        # titles become contiguous runs.
        group.sort(key=lambda x: x[0])
        groupSize = len(group)
        runStart = 0
        while runStart < groupSize:
            # Delimit the run of entries sharing the title at runStart.
            runTitle = group[runStart][0]
            runEnd = runStart + 1
            while runEnd < groupSize and runTitle == group[runEnd][0]:
                runEnd += 1

            # The first pending entry acts as the reference. Entries that do
            # not match it are kept and retried against the next reference,
            # so duplicates that differ from the first entry of the run are
            # still found instead of being skipped with the rest of the run.
            pending = group[runStart:runEnd]
            while len(pending) > 1:
                _, refTags, refMetadataName = pending[0]
                currMatch = []
                unmatched = []
                for entry in pending[1:]:
                    _, entryTags, entryMetadataName = entry
                    if _tagsAgree(refTags, entryTags, ignoredTags):
                        currMatch.append(entryMetadataName)
                    else:
                        unmatched.append(entry)
                if currMatch:
                    currMatch.append(refMetadataName)
                    output.append(currMatch)
                pending = unmatched

            runStart = runEnd
    return output