lib/core/crossref.py (63 lines of code) (raw):
#!/usr/bin/env python
#
# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
# or more contributor license agreements. Licensed under the Elastic License
# 2.0 and the following additional limitation. Functionality enabled by the
# files subject to the Elastic License 2.0 may only be used in production when
# invoked by an Elasticsearch process with a license key installed that permits
# use of machine learning features. You may not use this file except in
# compliance with the Elastic License 2.0 and the foregoing additional
# limitation.
#
#
# Script for cross-referencing words in scowl.dict with those in mobyposi.txt to
# create en.dict
#
# scowl.dict contains a more reasonably sized word list, but only mobyposi.txt
# has the part-of-speech codes
#
# There are some heuristics to cope with the lack of derived words in
# mobyposi.txt; if none of these work then the type is set to '?'
#
SEPARATOR = '@'
lookupTable = {}
with open('mobyposi.txt', 'r') as mobyFile:
for line in mobyFile:
parts = line.strip().split(SEPARATOR)
if len(parts) == 2:
word = parts[0].lower()
partOfSpeechCode = parts[1]
if not word in lookupTable:
lookupTable[word] = partOfSpeechCode
with open('scowl.dict', 'r') as scowlFile, open('en.dict', 'w') as mappedFile:
for line in scowlFile:
word = line.strip()
if word in lookupTable:
mappedFile.write(word)
mappedFile.write(SEPARATOR)
origPartOfSpeechCode = lookupTable[word]
if word[:1] == 'a' and \
origPartOfSpeechCode[:1] == 'D':
mappedFile.write(origPartOfSpeechCode.replace('D', 'I', 1))
else:
mappedFile.write(origPartOfSpeechCode)
mappedFile.write('\n')
elif word[len(word) - 1:] == 's' and \
word[:len(word) - 1] in lookupTable:
mappedFile.write(word)
mappedFile.write(SEPARATOR)
origPartOfSpeechCode = lookupTable[word[:len(word) - 1]]
if origPartOfSpeechCode.find('N') == -1:
mappedFile.write(origPartOfSpeechCode.replace('A', 'p', 1))
else:
mappedFile.write(origPartOfSpeechCode.replace('N', '&', 1).replace('A', '&', 1).replace('&', 'p', 1).replace('&', ''))
mappedFile.write('\n')
elif word[len(word) - 2:] == 'ed' and \
(word[:len(word) - 2] in lookupTable or word[:len(word) - 1] in lookupTable):
mappedFile.write(word)
mappedFile.write(SEPARATOR)
if word[:len(word) - 2] in lookupTable:
origPartOfSpeechCode = lookupTable[word[:len(word) - 2]]
else:
origPartOfSpeechCode = lookupTable[word[:len(word) - 1]]
mappedFile.write(origPartOfSpeechCode.replace('N', '&', 1).replace('p', '&', 1).replace('A', '&', 1).replace('&', 'A', 1).replace('&', ''))
mappedFile.write('\n')
elif word[len(word) - 3:] == 'ing' and \
(word[:len(word) - 3] in lookupTable or word[:len(word) - 3] + 'e' in lookupTable):
if word[:len(word) - 3] in lookupTable:
origPartOfSpeechCode = lookupTable[word[:len(word) - 3]]
else:
origPartOfSpeechCode = lookupTable[word[:len(word) - 3] + 'e']
mappedFile.write(word)
mappedFile.write(SEPARATOR)
mappedFile.write(origPartOfSpeechCode.replace('N', '&', 1).replace('p', '&', 1).replace('A', '&', 1).replace('&', 'A', 1).replace('&', ''))
mappedFile.write('\n')
else:
mappedFile.write(word)
mappedFile.write(SEPARATOR)
if word[len(word) - 1:] == 's' and \
word[len(word) - 2:] != 'es' and \
word[len(word) - 2:] != 'ss':
mappedFile.write('p')
else:
mappedFile.write('?')
mappedFile.write('\n')