in lib/datasets/vg_wiki_and_relco_lan.py [0:0]
def __init__(self):
    """Load or build the language embeddings for objects and predicates.

    Category names are read from two text files under
    ``<cfg.DATA_DIR>/Visual_Genome`` (one name per line, duplicates dropped
    while keeping first-seen order). If a pickled cache exists under
    ``<cfg.DATA_DIR>/cache`` the embeddings are loaded from it; otherwise
    they are computed from two word-vector models and the cache is written.

    Each row of ``self.obj_vecs`` / ``self.prd_vecs`` corresponds to the
    entry at the same index in ``self._object_categories`` /
    ``self._predicate_categories``. The first half of a row comes from the
    GoogleNews Word2Vec model, the second half from the relation
    co-occurrence model; each half is the L2-normalised mean of the
    L2-normalised vectors of the words in the category name.

    Raises:
        AssertionError: if the Visual Genome data directory is missing.
        NameError: if a word of a category name is not in the GoogleNews
            vocabulary (words missing from the co-occurrence model fall
            back to that model's mean vector instead).
    """
    self._data_path = os.path.join(cfg.DATA_DIR, 'Visual_Genome')
    assert os.path.exists(self._data_path), \
        'Path does not exist: {}'.format(self._data_path)

    def _read_unique_categories(path):
        # Strip only the line terminator: slicing off the last character
        # would truncate a final line that has no trailing newline.
        # De-duplicate in first-seen order so the list order is
        # deterministic and matches the order of the embedding rows.
        seen = set()
        names = []
        with open(path) as fid:
            for line in fid:
                name = line.rstrip('\r\n')
                if name not in seen:
                    seen.add(name)
                    names.append(name)
        return names

    self._object_categories = _read_unique_categories(
        self._data_path + '/object_categories_spo_joined_and_merged.txt')
    print(len(self._object_categories))

    self._predicate_categories = _read_unique_categories(
        self._data_path + '/predicate_categories_spo_joined_and_merged.txt')
    print(len(self._predicate_categories))

    # Define these before the cache check so the attributes exist on the
    # instance regardless of which path is taken below.
    self.model = None
    self.relco_model = None
    self.relco_vec_mean = None

    cache_path = osp.abspath(osp.join(cfg.DATA_DIR, 'cache'))
    cache_file = os.path.join(cache_path, 'vg_wiki_and_relco_gt_landb.pkl')
    if os.path.exists(cache_file):
        # NOTE: pickle must only be loaded from a trusted location.
        with open(cache_file, 'rb') as fid:
            landb = cPickle.load(fid)
        logger.info('vg wiki and relco spo joined and merged gt landb loaded from {}'.format(cache_file))
        self.obj_vecs = landb['obj_vecs']
        self.prd_vecs = landb['prd_vecs']
        return

    # Load gt data from scratch
    # Load Google's pre-trained Word2Vec model.
    self.model = gensim.models.KeyedVectors.load_word2vec_format(
        cfg.DATA_DIR + '/models/GoogleNews-vectors-negative300.bin', binary=True)
    print('Model loaded.')
    # change everything into lowercase
    # Iterate over a snapshot of the keys: the dict is mutated in the loop,
    # which raises RuntimeError on Python 3 when iterating a live view.
    for key in list(self.model.vocab.keys()):
        new_key = key.lower()
        self.model.vocab[new_key] = self.model.vocab.pop(key)
    print('Wiki words converted to lowercase.')

    # Load rel_cooccur_300d model.
    self.relco_model = gensim.models.Word2Vec.load(
        cfg.DATA_DIR + '/label_embeddings/vg_300d_skipgram_rel')
    print('Model loaded.')
    # Mean of all co-occurrence vectors; used for out-of-vocabulary words.
    self.relco_vec_mean = self.relco_model.wv.syn0.mean(axis=0)
    # change everything into lowercase
    for key in list(self.relco_model.vocab.keys()):
        new_key = key.lower()
        self.relco_model.vocab[new_key] = self.relco_model.vocab.pop(key)
    print('Relco words converted to lowercase.')

    half_dim = int(cfg.INPUT_LANG_EMBEDDING_DIM / 2)

    def _wiki_lookup(word):
        # A word missing from the GoogleNews vocabulary is fatal.
        if word not in self.model.vocab:
            print('Singular word found: ', word)
            raise NameError('Terminated.')
        return self.model[word]

    def _relco_lookup(word):
        # A word missing from the co-occurrence model uses the mean vector.
        if word in self.relco_model.vocab:
            return self.relco_model[word]
        return self.relco_vec_mean

    def _mean_unit_vector(name, lookup):
        # L2-normalised mean of the L2-normalised vectors of each word.
        words = name.split()
        acc = np.zeros(half_dim, dtype=np.float32)
        for word in words:
            raw_word = lookup(word)
            acc += (raw_word / la.norm(raw_word))
        acc /= len(words)
        acc /= la.norm(acc)
        return acc

    def _embed_categories(names):
        # Row ix describes names[ix]: wiki half first, then relco half.
        vecs = np.zeros((len(names), cfg.INPUT_LANG_EMBEDDING_DIM),
                        dtype=np.float32)
        for ix, name in enumerate(names):
            vecs[ix][:half_dim] = _mean_unit_vector(name, _wiki_lookup)
            vecs[ix][half_dim:] = _mean_unit_vector(name, _relco_lookup)
        return vecs

    # get word vectors for all categories
    # Embed exactly the lists stored on the instance so the matrix size and
    # row order always agree with them.
    self.obj_vecs = _embed_categories(self._object_categories)
    self.prd_vecs = _embed_categories(self._predicate_categories)

    landb = dict(obj_vecs=self.obj_vecs, prd_vecs=self.prd_vecs)
    # Make sure the cache directory exists so the expensive computation
    # above is not lost to a failed open().
    if not os.path.isdir(cache_path):
        os.makedirs(cache_path)
    with open(cache_file, 'wb') as fid:
        cPickle.dump(landb, fid, cPickle.HIGHEST_PROTOCOL)
    print('wrote gt roidb to {}'.format(cache_file))