in iep/preprocess.py [0:0]
def build_vocab(sequences, min_token_count=1, delim=' ',
                punct_to_keep=None, punct_to_remove=None):
  # Count how often each token appears across all input sequences.
  token_to_count = {}
  tokenize_kwargs = {
    'delim': delim,
    'punct_to_keep': punct_to_keep,
    'punct_to_remove': punct_to_remove,
  }
  for seq in sequences:
    seq_tokens = tokenize(seq, **tokenize_kwargs,
                          add_start_token=False, add_end_token=False)
    for token in seq_tokens:
      if token not in token_to_count:
        token_to_count[token] = 0
      token_to_count[token] += 1

  # Reserve the fixed indices from SPECIAL_TOKENS (defined in this module),
  # then assign the remaining indices to tokens that appear at least
  # min_token_count times, in sorted order so the mapping is deterministic.
  token_to_idx = {}
  for token, idx in SPECIAL_TOKENS.items():
    token_to_idx[token] = idx
  for token, count in sorted(token_to_count.items()):
    if count >= min_token_count:
      token_to_idx[token] = len(token_to_idx)

  return token_to_idx
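
A minimal usage sketch follows. It assumes build_vocab is importable as iep.preprocess.build_vocab and that tokenize and SPECIAL_TOKENS live in the same module; the example sentences and the punct_to_keep / punct_to_remove / min_token_count values are illustrative only, not taken from the repository's preprocessing scripts.

# Hypothetical usage sketch: build a vocabulary from a tiny corpus, keeping
# ';' and ',' as standalone tokens and stripping '?' and '.' entirely.
from iep.preprocess import build_vocab  # assumes the iep package is on sys.path

questions = [
  'How many red cubes are there?',
  'Is there a red cube behind the sphere?',
]
vocab = build_vocab(questions, min_token_count=1,
                    punct_to_keep=[';', ','], punct_to_remove=['?', '.'])

# vocab maps each surviving token to an integer index; the SPECIAL_TOKENS
# entries keep their fixed indices, and the corpus tokens follow in sorted order.
print(vocab)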