in source/sagemaker/src/package/data_privatization/data_privatization.py [0:0]
def create_clean_counter(input_data, add_space_split=False):
    """Count the cleaned text fragments found across all reviews.

    For every example, the ``review`` items are joined with single spaces,
    quote characters are removed, slashes are turned into spaces, and the
    resulting text is split on punctuation (and optionally on whitespace).
    Each non-empty fragment, stripped of surrounding whitespace, is tallied.

    Args:
        input_data: Iterable of objects exposing a ``review`` attribute that
            is an iterable of strings (presumably tokens -- confirm against
            the caller).
        add_space_split: When true, whitespace is also a delimiter, so the
            counted fragments never contain whitespace.

    Returns:
        A ``Counter`` mapping each cleaned fragment to its occurrence count.
    """
    # A raw-string character class replaces the former non-raw alternation,
    # whose "\!", "\,", "\." ... escapes are invalid Python string escapes
    # (SyntaxWarning on recent interpreters). It matches the same characters.
    delimiters = r"!,\n.?\-;:()"
    if add_space_split:
        delimiters += r"\s"
    # Compiled once here instead of being re-resolved for every example.
    splitter = re.compile("[" + delimiters + "]")

    phrase_count = Counter()
    for example in input_data:
        # Replacement order matters: a spaced apostrophe (" ' ") is removed
        # together with its surrounding spaces before bare apostrophes are
        # dropped, and slashes become spaces before double spaces collapse.
        text = (
            " ".join(example.review)
            .replace(" ' ", "")
            .replace("'", "")
            .replace("/", " ")
            .replace("  ", " ")
            .replace('"', "")
        )
        # Strip each fragment once and skip the empty ones produced by
        # adjacent or trailing delimiters.
        phrase_count.update(
            fragment
            for fragment in map(str.strip, splitter.split(text))
            if fragment
        )
    return phrase_count