in lambdachecker/sms_spam_classifier_utilities.py [0:0]
def hashing_trick(text, n,
hash_function=None,
filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
lower=True,
split=' '):
"""Converts a text to a sequence of indexes in a fixed-size hashing space.
# Arguments
text: Input text (string).
n: Dimension of the hashing space.
hash_function: defaults to python `hash` function, can be 'md5' or
any function that takes in input a string and returns a int.
Note that 'hash' is not a stable hashing function, so
it is not consistent across different runs, while 'md5'
is a stable hashing function.
filters: list (or concatenation) of characters to filter out, such as
punctuation. Default: `!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n`,
includes basic punctuation, tabs, and newlines.
lower: boolean. Whether to set the text to lowercase.
split: str. Separator for word splitting.
# Returns
A list of integer word indices (unicity non-guaranteed).
`0` is a reserved index that won't be assigned to any word.
Two or more words may be assigned to the same index, due to possible
collisions by the hashing function.
The [probability](
https://en.wikipedia.org/wiki/Birthday_problem#Probability_table)
of a collision is in relation to the dimension of the hashing space and
the number of distinct objects.
"""
if hash_function is None:
hash_function = hash
elif hash_function == 'md5':
hash_function = lambda w: int(md5(w.encode()).hexdigest(), 16)
seq = text_to_word_sequence(text,
filters=filters,
lower=lower,
split=split)
return [int(hash_function(w) % (n - 1) + 1) for w in seq]