def load_europarl_data()

in src/evaluation/sent_translation.py [0:0]


def load_europarl_data(lg1, lg2, n_max=1e10, lower=True):
    """
    Load parallel Europarl sentences for a language pair.

    Looks for 'europarl-v7.<lg1>-<lg2>.<lg>' files under EUROPARL_DIR (the
    corpus names the pair in only one order, so both orders are probed).

    Args:
        lg1, lg2: language codes (e.g. 'en', 'fr').
        n_max: maximum number of lines to read per file.
        lower: lowercase sentences before tokenizing.

    Returns:
        dict mapping each language code to a 1-D numpy object array of
        token lists (both arrays aligned and equal length), or None if
        the corpus files are missing. Sentences are deduplicated per
        language and shuffled with a fixed seed for reproducibility.
    """
    if not (os.path.isfile(os.path.join(EUROPARL_DIR, 'europarl-v7.%s-%s.%s' % (lg1, lg2, lg1))) or
            os.path.isfile(os.path.join(EUROPARL_DIR, 'europarl-v7.%s-%s.%s' % (lg2, lg1, lg1)))):
        return None

    # the corpus only exists under one ordering of the pair; normalize to it
    if os.path.isfile(os.path.join(EUROPARL_DIR, 'europarl-v7.%s-%s.%s' % (lg2, lg1, lg1))):
        lg1, lg2 = lg2, lg1

    # load sentences (one token list per line)
    data = {lg1: [], lg2: []}
    for lg in [lg1, lg2]:
        fname = os.path.join(EUROPARL_DIR, 'europarl-v7.%s-%s.%s' % (lg1, lg2, lg))

        with io.open(fname, 'r', encoding='utf-8') as f:
            for i, line in enumerate(f):
                if i >= n_max:
                    break
                line = line.lower() if lower else line
                data[lg].append(line.rstrip().split())

    # keep only sentence pairs that are unique on both sides.
    # NOTE: the original implementation used np.unique on arrays of token
    # lists; np.array on ragged lists raises on NumPy >= 1.24 and sorting
    # lists inside np.unique is fragile. Deduplicate with hashable tuple
    # keys instead, keeping the first occurrence of each sentence.
    assert len(data[lg1]) == len(data[lg2])
    kept = []
    seen_src = set()
    seen_tgt = set()
    for idx in range(len(data[lg1])):
        src_key = tuple(data[lg1][idx])
        if src_key in seen_src:
            continue
        tgt_key = tuple(data[lg2][idx])
        if tgt_key in seen_tgt:
            continue
        seen_src.add(src_key)
        seen_tgt.add(tgt_key)
        kept.append(idx)

    # build explicit object arrays (token lists have varying lengths) so
    # callers can still fancy-index with integer arrays
    n_kept = len(kept)
    src_arr = np.empty(n_kept, dtype=object)
    tgt_arr = np.empty(n_kept, dtype=object)
    for j, idx in enumerate(kept):
        src_arr[j] = data[lg1][idx]
        tgt_arr[j] = data[lg2][idx]

    # shuffle sentences with a fixed seed for reproducibility
    rng = np.random.RandomState(1234)
    perm = rng.permutation(n_kept)
    data[lg1] = src_arr[perm]
    data[lg2] = tgt_arr[perm]

    logger.info("Loaded europarl %s-%s (%i sentences)." % (lg1, lg2, len(data[lg1])))
    return data