in src/evaluation/sent_translation.py [0:0]
def load_europarl_data(lg1, lg2, n_max=1e10, lower=True):
    """
    Load Europarl parallel sentences for a language pair.

    Looks for 'europarl-v7.<lg1>-<lg2>.*' (or the reversed pair name) under
    EUROPARL_DIR, reads up to n_max sentence pairs, deduplicates sentences on
    both sides, and shuffles the pairs with a fixed seed for reproducibility.

    Args:
        lg1 (str): first language code.
        lg2 (str): second language code.
        n_max (int|float): maximum number of lines to read per file.
        lower (bool): lowercase sentences before tokenizing on whitespace.

    Returns:
        dict mapping each language code to a NumPy object array of token
        lists (aligned pair-wise), or None if no Europarl file is found.
    """
    # Europarl files use one canonical pair order in their name; probe both.
    if not (os.path.isfile(os.path.join(EUROPARL_DIR, 'europarl-v7.%s-%s.%s' % (lg1, lg2, lg1))) or
            os.path.isfile(os.path.join(EUROPARL_DIR, 'europarl-v7.%s-%s.%s' % (lg2, lg1, lg1)))):
        return None
    # If only the reversed pair name exists, swap so the filename pattern below matches.
    if os.path.isfile(os.path.join(EUROPARL_DIR, 'europarl-v7.%s-%s.%s' % (lg2, lg1, lg1))):
        lg1, lg2 = lg2, lg1

    # load sentences
    data = {lg1: [], lg2: []}
    for lg in [lg1, lg2]:
        fname = os.path.join(EUROPARL_DIR, 'europarl-v7.%s-%s.%s' % (lg1, lg2, lg))
        with io.open(fname, 'r', encoding='utf-8') as f:
            for i, line in enumerate(f):
                if i >= n_max:
                    break
                line = line.lower() if lower else line
                data[lg].append(line.rstrip().split())

    # get only unique sentences for each language
    assert len(data[lg1]) == len(data[lg2])
    # Build object arrays explicitly: sentences have different token counts,
    # and np.array() on such ragged lists raises ValueError in NumPy >= 1.24.
    for lg in [lg1, lg2]:
        arr = np.empty(len(data[lg]), dtype=object)
        arr[:] = data[lg]
        data[lg] = arr
    # Deduplicate on the source side, keep aligned targets, then deduplicate
    # on the target side as well so every pair is unique in both languages.
    data[lg1], indices = np.unique(data[lg1], return_index=True)
    data[lg2] = data[lg2][indices]
    data[lg2], indices = np.unique(data[lg2], return_index=True)
    data[lg1] = data[lg1][indices]

    # shuffle sentences (fixed seed so evaluation splits are reproducible)
    rng = np.random.RandomState(1234)
    perm = rng.permutation(len(data[lg1]))
    data[lg1] = data[lg1][perm]
    data[lg2] = data[lg2][perm]

    logger.info("Loaded europarl %s-%s (%i sentences)." % (lg1, lg2, len(data[lg1])))
    return data