cs_CZ/thesaurus/dictionary-to-thesaurus.py (126 lines of code) (raw):
#!/usr/bin/env python
# coding=utf-8
#
# This file is part of the LibreOffice project.
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
#
# This utility translates a normal dictionary (in this case English/Czech)
# into a thesaurus for one of the languages (in this case Czech).
#
# Based on an idea by Zdenek Zabokrtsky <zabokrtsky@ufal.mff.cuni.cz>, big
# thanks! :-)
import os
import re
import sys
def usage():
    """Write the command-line help text to standard error.

    The program name shown in the text is the base name of ``sys.argv[0]``.
    """
    program_name = os.path.basename(sys.argv[0])
    template = """Usage: {program} en-cs.txt blacklist.txt
en-cs.txt: Dictionary data from https://www.svobodneslovniky.cz/
blacklist.txt: List of words that should be ignored when generating
"""
    sys.stderr.write(template.format(program=program_name))
def classify(typ):
    """Translate a part-of-speech abbreviation into its thesaurus label.

    The abbreviations 'adj', 'adv', 'n' and 'v' map to a parenthesised
    label; every other value, including the empty string, yields ''.
    """
    labels = {
        'adj': '(příd. jm.)',
        'adv': '(přísl.)',
        'n': '(podst. jm.)',
        'v': '(slov.)',
    }
    return labels.get(typ, '')
def parse(filename, blacklistname):
    """Read the dictionary and the blacklist and build the lookup tables.

    Args:
        filename: Path of the tab-separated dictionary file. Each data line
            holds the index term, the translated word and, optionally, a
            type column. Lines starting with '#' are skipped.
        blacklistname: Path of the blacklist file, one entry per line.
            Lines starting with '#' are skipped. An entry suppresses a
            dictionary line when it equals "index", "index<TAB>word",
            "index<TAB>" or "<TAB>word".

    Returns:
        A tuple ``(synonyms, meanings, classification)``:
        ``synonyms`` maps index -> list of (word, type label) pairs,
        ``meanings`` maps word -> list of indexes it translates, and
        ``classification`` maps word -> list of distinct non-empty type
        labels seen for that word.
    """
    blacklist = set()
    with open(blacklistname, "r") as fp:
        for line in fp:
            if line.startswith('#'):
                continue
            # Only spaces and line terminators are stripped: a leading or
            # trailing tab is significant ("<TAB>word" / "index<TAB>").
            entry = line.strip(' \r\n')
            if entry:
                blacklist.add(entry)

    synonyms = {}
    meanings = {}
    classification = {}

    # Raw strings keep the backslashes without relying on invalid
    # string-literal escapes; the patterns themselves are unchanged.
    match_ignore = re.compile(r'(\[neprav\.\]|\[vulg\.\])')
    match_cleanup = re.compile(r'(\[.*\]|\*|:.*)')

    with open(filename, "r") as fp:
        for line in fp:
            if line.startswith('#'):
                continue

            terms = line.split('\t')
            if len(terms) < 2:
                continue

            index = terms[0].strip()
            if index == '':
                continue

            word = terms[1].strip()
            if word.startswith('"') and word.endswith('"'):
                word = word.strip('" ')
            if word == '':
                continue

            if (index + '\t' + word in blacklist or
                    index in blacklist or
                    index + '\t' in blacklist or
                    '\t' + word in blacklist):
                continue

            # The type column is optional: a two-column line has no
            # terms[2], so only read it when a third column exists.
            typ = terms[2] if len(terms) >= 3 else ''

            # ignore non-translations
            if match_ignore.search(typ) is not None:
                continue

            typ = match_cleanup.sub('', typ).strip()
            # An empty type has no label (classify('') is ''), so the
            # lookup is only needed for non-empty types.
            if typ != '':
                typ = classify(typ)

            synonyms.setdefault(index, []).append((word, typ))
            meanings.setdefault(word, []).append(index)

            if typ != '':
                known_types = classification.setdefault(word, [])
                if typ not in known_types:
                    known_types.append(typ)

    return (synonyms, meanings, classification)
def buildThesaurus(synonyms, meanings, classification):
    """Print a thesaurus entry for every word that has at least one synonym.

    For each word (in sorted order) the indexes it translates are looked up
    in ``meanings`` and mapped back to words through ``synonyms``; those
    words are its synonyms. Each entry is printed as a ``word|N`` header
    followed by N lines of the form ``type|syn1|syn2...``. When the word's
    own type is unambiguous the type prefix is omitted from its lines.

    Args:
        synonyms: Maps index -> list of (word, type label) pairs.
        meanings: Maps word -> list of indexes.
        classification: Maps word -> list of type labels seen for the word.
    """
    for word in sorted(meanings):
        # we assume that various indexes (english words here) are various
        # meanings; not generally true, but...
        indexes = meanings[word]

        # only limit the words if the type is unambiguous
        typ = ''
        word_types = classification.get(word, ())
        if len(word_types) == 1:
            typ = word_types[0]

        # we want to output each word just once
        used_this_round = {word}
        output_lines = []

        for index in indexes:
            # Types in order of first appearance among *all* synonyms of
            # this index (including filtered ones); this fixes line order.
            types = []
            grouped = {}
            for (w, t) in synonyms[index]:
                if t not in types:
                    types.append(t)
                # Skip synonyms whose known type contradicts the word's.
                if typ != '' and t != '' and typ != t:
                    continue
                if w in used_this_round:
                    continue
                used_this_round.add(w)
                grouped.setdefault(t, []).append(w)

            for t in types:
                if t in grouped:
                    output_lines.append((t, '|' + '|'.join(grouped[t])))

        if not output_lines:
            continue

        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(word + '|' + str(len(output_lines)))

        # those with existing classification are probably a better fit,
        # put them to the front (even if we don't output the
        # classification in the end); sorted() is stable, so the relative
        # order inside each group is preserved.
        for (t, line) in sorted(output_lines, key=lambda entry: entry[0] == ''):
            if typ == '':
                print(t + line)
            else:
                print(line)
def main(args):
    """Convert the dictionary named on the command line into a thesaurus.

    Args:
        args: Full argument vector: program name, dictionary file path and
            blacklist file path (exactly three items).

    When the argument count is wrong, the usage text is printed and the
    process exits with status 1. Otherwise the line "UTF-8" is printed to
    standard output, followed by the generated thesaurus entries.
    """
    if len(args) != 3:
        usage()
        sys.exit(1)

    synonyms, meanings, classification = parse(args[1], args[2])

    # Single-argument print(...) is valid on both Python 2 and Python 3;
    # presumably this first line declares the encoding of the output file.
    print("UTF-8")
    buildThesaurus(synonyms, meanings, classification)
# Run the conversion only when this file is executed as a script; the full
# argument vector (including the program name) is handed to main().
if __name__ == "__main__":
    main(sys.argv)
# vim:set shiftwidth=4 softtabstop=4 expandtab: