def getBaseTitle()

in data_preparation/metadata_completion/DuplicateSearch.py [0:0]


def getBaseTitle(title):
    """Normalize a title and extract its edition/numbering tags.

    The title is lower-cased, the labels "dramatic reading" and "abridged"
    are removed (and recorded as ``True`` tags), every word is stripped of
    non-alphanumeric characters, and the stop words "a", "the", "of" and
    "in" are discarded.  A numbering keyword ("version", "vol", "chapter",
    "part", "volume", "book") immediately followed by a decimal number is
    removed from the title and stored as ``tags[keyword] = number``.

    A numbering keyword that is NOT followed by a number is an ordinary
    word of the title and is kept, so that e.g. "The Jungle Book" and
    "The Jungle Book (version 2)" share the base title "jungle book".

    Args:
        title: the raw title string.

    Returns:
        A ``(baseTitle, tags)`` tuple: the normalized title (words joined
        by single spaces) and a dict mapping each detected label to
        ``True`` or each numbering keyword to its integer value.
    """
    in_str = title.lower()

    tags = {}
    for label in ("dramatic reading", "abridged"):
        if label in in_str:
            tags[label] = True
            in_str = in_str.replace(label, '')

    # Drop punctuation inside each token; tokens made only of punctuation
    # become empty strings and are skipped below.
    words = [''.join(char for char in word if char.isalnum())
             for word in in_str.split()]

    keyWords = {"version", "vol", "chapter", "part", "volume", "book"}
    forbiddenWords = {"a", "the", "of", "in"}

    kept = []
    index = 0
    nItems = len(words)
    while index < nItems:
        word = words[index]
        # isdecimal() rather than isdigit(): isdigit() also accepts
        # characters such as superscript digits that int() cannot parse.
        if (word in keyWords and index + 1 < nItems
                and words[index + 1].isdecimal()):
            tags[word] = int(words[index + 1])
            index += 2
            continue
        if word and word not in forbiddenWords:
            kept.append(word)
        index += 1

    return " ".join(kept), tags