in data_preparation/metadata_completion/DuplicateSearch.py [0:0]
def getBaseTitle(title):
    """Reduce a title to a normalized base title plus a dict of tags.

    Processing steps:
      1. The title is lower-cased.
      2. The label phrases "dramatic reading" and "abridged" are removed;
         each phrase that was present is recorded as ``tags[label] = True``.
      3. The remainder is split on whitespace and every word is stripped of
         its non-alphanumeric characters.
      4. A keyword ("version", "vol", "chapter", "part", "volume", "book")
         immediately followed by a decimal number is removed together with
         that number and recorded as ``tags[keyword] = number`` (a later
         occurrence of the same keyword overwrites an earlier one).
      5. Empty words and the words "a", "the", "of", "in" are discarded.

    Note that a keyword that is *not* followed by a number is dropped from
    the base title, unless it is the very last word, in which case it is
    kept like any ordinary word.

    Args:
        title: the raw title string.

    Returns:
        A ``(baseTitle, tags)`` tuple where ``baseTitle`` is the remaining
        words joined by single spaces and ``tags`` maps label phrases to
        ``True`` and keywords to their integer number.
    """
    in_str = title.lower()

    tags = {}
    for label in ("dramatic reading", "abridged"):
        if label in in_str:
            tags[label] = True
            in_str = in_str.replace(label, '')

    # Keep only alphanumeric characters of each word; a word made solely of
    # punctuation becomes '' and is skipped below.
    words = [''.join(char for char in word if char.isalnum())
             for word in in_str.split()]
    nItems = len(words)

    keyWords = {"version", "vol", "chapter", "part", "volume", "book"}
    forbiddenWords = {"a", "the", "of", "in"}

    kept = []
    index = 0
    while index < nItems:
        word = words[index]
        if word in keyWords and index < nItems - 1:
            nextWord = words[index + 1]
            # isdecimal() rather than isdigit(): isdigit() also accepts
            # characters such as superscript digits that int() cannot
            # parse, which would raise ValueError.
            if nextWord.isdecimal():
                tags[word] = int(nextWord)
                index += 2  # consume both the keyword and its number
                continue
            # Keyword without a number: dropped from the base title.
        elif word and word not in forbiddenWords:
            kept.append(word)
        index += 1

    return " ".join(kept), tags