in source/sagemaker/src/package/data_privatization/data_privatization.py [0:0]
def privatize_example(example, local_vocab, local_embedding_dims, local_epsilon):
    """Build one privatized output row for ``example``.

    ``clean_example`` splits the example into phrases. Every space-separated
    word of every non-blank phrase is passed through ``replace_word``, and the
    replacements are joined with single spaces into one text.

    Args:
        example: Record exposing a ``sentiment`` attribute; it is also handed
            to ``clean_example`` unchanged.
        local_vocab: Forwarded unchanged to ``replace_word``.
        local_embedding_dims: Dimension used to construct the Annoy index;
            also forwarded to ``replace_word``.
        local_epsilon: Forwarded unchanged to ``replace_word`` (presumably the
            privacy parameter -- confirm against ``replace_word``).

    Returns:
        str: A row of the form ``"<privatized text>",<sentiment>``.
    """
    # Function-scope import, kept from the original; presumably so the module
    # is resolved where this function actually runs -- TODO confirm.
    from annoy import AnnoyIndex

    # Load the index file registered under the name "index.ann".
    local_index = AnnoyIndex(local_embedding_dims, 'euclidean')
    local_index.load(SparkFiles.get("index.ann"))

    # Materialize the stripped, non-blank phrases before replacing any word.
    sensitive_phrases = [
        phrase
        for phrase in (raw.strip() for raw in clean_example(example))
        if phrase
    ]

    # Replace words phrase by phrase, in order, collecting them directly into
    # one flat list. split(' ') is deliberate: consecutive spaces yield empty
    # strings, which are still handed to replace_word.
    privatized_words = [
        replace_word(
            sensitive_word, local_vocab, local_epsilon, local_index, local_embedding_dims)
        for sensitive_phrase in sensitive_phrases
        for sensitive_word in sensitive_phrase.split(' ')
    ]
    privatized_review = " ".join(privatized_words)

    # NOTE(review): the text is wrapped in double quotes without escaping. If a
    # replacement word can contain a double quote the row may not parse as
    # intended -- confirm against the vocabulary and the downstream reader.
    privatized_row = "\"{}\",{}".format(privatized_review, example.sentiment)
    return privatized_row