def create_clean_counter()

in source/sagemaker/src/package/data_privatization/data_privatization.py [0:0]


def create_clean_counter(input_data, add_space_split=False):
    """Count the cleaned phrases that occur in the reviews of ``input_data``.

    For every example, the items of ``example.review`` are joined with single
    spaces, quote characters are removed, ``/`` is turned into a space, and
    the result is split into phrases on punctuation. Each phrase is stripped
    of surrounding whitespace; empty phrases are discarded.

    Args:
        input_data: Iterable of objects exposing a ``review`` attribute that
            is an iterable of strings.
        add_space_split: When true, whitespace is treated as a delimiter as
            well, so the counted units are individual tokens instead of
            punctuation-delimited phrases.

    Returns:
        A ``Counter`` mapping each phrase to its number of occurrences across
        all examples.
    """
    # Raw-string character classes: the delimiters are ! , newline . ? - ; : ( )
    # plus any whitespace when ``add_space_split`` is set. Non-raw literals such
    # as '\!' are invalid escape sequences and trigger warnings on modern Python.
    delimiter_class = r"[!,\n.?\-;:()\s]" if add_space_split else r"[!,\n.?\-;:()]"
    # The pattern does not depend on the example, so build it once.
    splitter = re.compile(delimiter_class)

    phrase_count = Counter()
    for example in input_data:
        # NOTE: the order of the replacements matters. " ' " (a free-standing
        # apostrophe token) is removed together with its surrounding spaces
        # before the remaining apostrophes are dropped. The double-space
        # replacement is a single pass, so runs of three or more spaces are
        # shortened but not fully collapsed.
        cleaned = (
            " ".join(example.review)
            .replace(" ' ", "")
            .replace("'", "")
            .replace("/", " ")
            .replace("  ", " ")
            .replace('"', "")
        )
        stripped_pieces = (piece.strip() for piece in splitter.split(cleaned))
        phrase_count.update(piece for piece in stripped_pieces if piece)
    return phrase_count