in sockeye/vocab.py [0:0]
def load_or_create_vocabs(shard_source_paths: Iterable[Iterable[str]],
                          shard_target_paths: Iterable[Iterable[str]],
                          source_vocab_paths: List[Optional[str]],
                          source_factor_vocab_same_as_source: List[bool],
                          target_vocab_paths: List[Optional[str]],
                          target_factor_vocab_same_as_target: List[bool],
                          shared_vocab: bool,
                          num_words_source: Optional[int], word_min_count_source: int,
                          num_words_target: Optional[int], word_min_count_target: int,
                          pad_to_multiple_of: Optional[int] = None,
                          mapper: Callable = map) -> Tuple[List[Vocab], List[Vocab]]:
    """
    Returns vocabularies for source files (including factors) and target files (including factors).
    If the respective vocabulary paths are not None, the vocabulary is read from the path and returned.
    Vocabularies without a path are built from the corresponding shard files (via build_from_paths for a
    shared vocabulary, otherwise delegated to load_or_create_vocab).

    Invalid argument combinations are reported through utils.check_condition.

    :param shard_source_paths: List of shards of list paths to the source text (and optional token-parallel factor files).
    :param shard_target_paths: List of shards of list paths to the target text (and optional token-parallel factor files).
    :param source_vocab_paths: The source vocabulary path (and optional factor vocabulary paths).
    :param source_factor_vocab_same_as_source: List of bools whether factor vocabulary is equal to primary factor.
           An empty list means no factor shares the primary vocabulary; a single flag applies to all factors.
    :param target_vocab_paths: The target vocabulary path (and optional factor vocabulary paths).
    :param target_factor_vocab_same_as_target: List of bools whether factor vocabulary is equal to primary factor.
           An empty list means no factor shares the primary vocabulary; a single flag applies to all factors.
    :param shared_vocab: Whether the source and target vocabularies are shared.
    :param num_words_source: Number of words in the source vocabulary.
    :param word_min_count_source: Minimum frequency of words in the source vocabulary.
    :param num_words_target: Number of words in the target vocabulary.
    :param word_min_count_target: Minimum frequency of words in the target vocabulary.
    :param pad_to_multiple_of: If not None, pads the vocabularies to a size that is the next multiple of this int.
    :param mapper: Built-in map function or multiprocessing.pool.map with max_processes processes.
    :return: Tuple of (list of source vocabularies, list of target vocabularies). Each list starts with the
             surface form vocabulary, followed by one vocabulary per factor.
    """
    shard_source_sentence_paths: Tuple[str, ...]
    shard_source_factor_paths: List[Tuple[str, ...]]
    shard_target_sentence_paths: Tuple[str, ...]
    shard_target_factor_paths: List[Tuple[str, ...]]

    # Transpose [shard][stream] -> [stream][shard]: the first stream is the surface form text,
    # any remaining streams are factors.
    shard_source_sentence_paths, *shard_source_factor_paths = list(zip(*shard_source_paths))  # type: ignore
    source_vocab_path, *source_factor_vocab_paths = source_vocab_paths
    shard_target_sentence_paths, *shard_target_factor_paths = list(zip(*shard_target_paths))  # type: ignore
    target_vocab_path, *target_factor_vocab_paths = target_vocab_paths

    logger.info("=============================")
    logger.info("Loading/creating vocabularies")
    logger.info("=============================")
    logger.info("(1) Surface form vocabularies (source & target)")

    if shared_vocab:
        if source_vocab_path and target_vocab_path:
            # Both vocabularies are given: load both and require them to be identical.
            vocab_source = vocab_from_json(source_vocab_path)
            vocab_target = vocab_from_json(target_vocab_path)
            utils.check_condition(are_identical(vocab_source, vocab_target),
                                  "Shared vocabulary requires identical source and target vocabularies. "
                                  "The vocabularies in %s and %s are not identical." % (source_vocab_path,
                                                                                        target_vocab_path))
        elif source_vocab_path is None and target_vocab_path is None:
            # Neither vocabulary is given: build a single vocabulary over source and target text.
            utils.check_condition(num_words_source == num_words_target,
                                  "A shared vocabulary requires the number of source and target words to be the same.")
            utils.check_condition(word_min_count_source == word_min_count_target,
                                  "A shared vocabulary requires the minimum word count for source and target "
                                  "to be the same.")
            vocab_source = vocab_target = build_from_paths(
                paths=shard_source_sentence_paths + shard_target_sentence_paths,
                num_words=num_words_source,
                min_count=word_min_count_source,
                pad_to_multiple_of=pad_to_multiple_of,
                mapper=mapper)
        else:
            # Exactly one vocabulary is given: use it for both sides.
            vocab_path = source_vocab_path if source_vocab_path is not None else target_vocab_path
            logger.info("Using %s as a shared source/target vocabulary.", vocab_path)
            vocab_source = vocab_target = vocab_from_json(vocab_path)
    else:
        vocab_source = load_or_create_vocab(shard_source_sentence_paths, source_vocab_path,
                                            num_words_source, word_min_count_source,
                                            pad_to_multiple_of=pad_to_multiple_of, mapper=mapper)
        vocab_target = load_or_create_vocab(shard_target_sentence_paths, target_vocab_path,
                                            num_words_target, word_min_count_target,
                                            pad_to_multiple_of=pad_to_multiple_of, mapper=mapper)

    vocab_source_factors: List[Vocab] = []
    if shard_source_factor_paths:
        logger.info("(2) Additional source factor vocabularies")
        vocab_source_factors = _load_or_create_factor_vocabs("source",
                                                             shard_source_factor_paths,
                                                             source_factor_vocab_paths,
                                                             source_factor_vocab_same_as_source,
                                                             vocab_source,
                                                             num_words_source,
                                                             word_min_count_source,
                                                             pad_to_multiple_of,
                                                             mapper)

    vocab_target_factors: List[Vocab] = []
    if shard_target_factor_paths:
        logger.info("(3) Additional target factor vocabularies")
        vocab_target_factors = _load_or_create_factor_vocabs("target",
                                                             shard_target_factor_paths,
                                                             target_factor_vocab_paths,
                                                             target_factor_vocab_same_as_target,
                                                             vocab_target,
                                                             num_words_target,
                                                             word_min_count_target,
                                                             pad_to_multiple_of,
                                                             mapper)

    return [vocab_source] + vocab_source_factors, [vocab_target] + vocab_target_factors


def _load_or_create_factor_vocabs(side: str,
                                  shard_factor_paths: List[Tuple[str, ...]],
                                  factor_vocab_paths: List[Optional[str]],
                                  same_as_primary: List[bool],
                                  primary_vocab: Vocab,
                                  num_words: Optional[int],
                                  word_min_count: int,
                                  pad_to_multiple_of: Optional[int],
                                  mapper: Callable) -> List[Vocab]:
    """
    Returns one vocabulary per factor of one side (source or target).

    :param side: Name of the side ("source" or "target"); only used in error messages.
    :param shard_factor_paths: For each factor, the paths to its shard files. Must not be empty.
    :param factor_vocab_paths: For each factor, an optional vocabulary path passed on to load_or_create_vocab.
    :param same_as_primary: Flags whether a factor re-uses the primary vocabulary. An empty list means
           no factor does; a single flag is applied to all factors.
    :param primary_vocab: The surface form vocabulary of this side.
    :param num_words: Number of words, passed on to load_or_create_vocab.
    :param word_min_count: Minimum word frequency, passed on to load_or_create_vocab.
    :param pad_to_multiple_of: Padding multiple, passed on to load_or_create_vocab.
    :param mapper: Map function, passed on to load_or_create_vocab.
    :return: List of factor vocabularies, in factor order.
    """
    num_factors = len(shard_factor_paths)

    # Normalize the sharing flags to exactly one flag per factor (on a local copy, the caller's list is not
    # modified).
    if len(same_as_primary) > 1:
        utils.check_condition(len(same_as_primary) == num_factors,
                              "The number of flags for sharing the vocabulary of "
                              "%s factors does not match the number of %s "
                              "factors." % (side, side))
    elif len(same_as_primary) == 1:
        same_as_primary = same_as_primary * num_factors
    else:
        same_as_primary = [False] * num_factors

    # zip() below stops at the shortest input. Without this check, too few vocabulary paths would silently
    # drop factors and yield fewer vocabularies than factor streams. Surplus paths are ignored.
    utils.check_condition(len(factor_vocab_paths) >= num_factors,
                          "The number of %s factor vocabulary paths (%d) is smaller than the number of "
                          "%s factors (%d)." % (side, len(factor_vocab_paths), side, num_factors))

    factor_vocabs: List[Vocab] = []
    for factor_paths, factor_vocab_path, share_primary in zip(shard_factor_paths,
                                                              factor_vocab_paths,
                                                              same_as_primary):
        if share_primary:
            factor_vocabs.append(primary_vocab)
        else:
            factor_vocabs.append(load_or_create_vocab(factor_paths, factor_vocab_path,
                                                      num_words, word_min_count,
                                                      pad_to_multiple_of=pad_to_multiple_of,
                                                      mapper=mapper))
    return factor_vocabs