in cs_CZ/thesaurus/dictionary-to-thesaurus.py [0:0]
def parse(filename, blacklistname):
    """Parse a tab-separated dictionary file into thesaurus lookup tables.

    The blacklist file is read first: lines starting with '#' are comments,
    every other line (with surrounding spaces and the newline removed) is a
    blacklist entry.  An entry may be 'index<TAB>word', 'index',
    'index<TAB>' or '<TAB>word'.

    Each non-comment line of the dictionary file is split on tabs.  The
    first column is the index term, the second the word (optionally wrapped
    in double quotes) and the optional third column a type annotation.
    Rows with a missing/empty index or word, blacklisted rows and rows
    whose annotation contains '[neprav.]' or '[vulg.]' are skipped.

    Args:
        filename: Path of the tab-separated dictionary file.
        blacklistname: Path of the blacklist file.

    Returns:
        A tuple ``(synonyms, meanings, classification)`` of dicts:
        ``synonyms`` maps an index term to a list of ``(word, typ)`` tuples,
        ``meanings`` maps a word to the list of index terms it appeared
        under, and ``classification`` maps a word to the list of distinct
        non-empty types returned by ``classify`` for it.
    """
    blacklist = set()
    with open(blacklistname, "r") as fp:
        for line in fp:
            if line.startswith('#'):
                continue
            entry = line.strip(' \n')
            # Index and word are never empty, so an empty entry could
            # never match anything; do not bother storing it.
            if entry:
                blacklist.add(entry)

    synonyms = {}
    meanings = {}
    classification = {}

    # Raw strings keep the backslashes for the regex engine instead of
    # relying on Python passing unknown string escapes through.
    match_ignore = re.compile(r'(\[neprav\.\]|\[vulg\.\])')
    match_cleanup = re.compile(r'(\[.*\]|\*|:.*)')

    with open(filename, "r") as fp:
        for line in fp:
            if line.startswith('#'):
                continue

            terms = line.split('\t')
            if len(terms) < 2:
                continue

            index = terms[0].strip()
            if index == '':
                continue

            word = terms[1].strip()
            if word.startswith('"') and word.endswith('"'):
                word = word.strip('" ')
            if word == '':
                continue

            if (index + '\t' + word in blacklist or
                    index in blacklist or
                    index + '\t' in blacklist or
                    '\t' + word in blacklist):
                continue

            # The type column is optional: a row with only two columns has
            # no terms[2], so it is treated as having an empty annotation.
            typ = terms[2] if len(terms) >= 3 else ''

            # ignore non-translations
            if match_ignore.search(typ) is not None:
                continue

            typ = match_cleanup.sub('', typ).strip()
            typ = classify(typ)

            synonyms.setdefault(index, []).append((word, typ))
            meanings.setdefault(word, []).append(index)

            if typ != '':
                known_types = classification.setdefault(word, [])
                if typ not in known_types:
                    known_types.append(typ)

    return (synonyms, meanings, classification)