def remove_spam_paragraphs()

in build_obelics/13_final_processing.py [0:0]


def remove_spam_paragraphs(texts, images, metadata):
    """Filter spam-heavy paragraphs out of every text entry.

    Each non-``None`` entry of ``texts`` is split into paragraphs on a double
    newline. A paragraph is kept only when ``compute_spam_word_ratio`` returns
    a value strictly below ``SPAM_WORD_RATIO_CUTOFF``; the kept paragraphs are
    joined back together with a double newline. ``None`` entries stay ``None``,
    so the result has exactly one entry per input entry. A text whose
    paragraphs are all rejected becomes an empty string, not ``None``.

    Args:
        texts: Iterable of strings or ``None`` values.
        images: Returned unchanged (same object).
        metadata: Returned unchanged (same object).

    Returns:
        A tuple ``(new_texts, images, metadata)`` where ``new_texts`` is a new
        list holding the filtered texts.
    """

    def _strip_spam(document):
        # Keep paragraphs in their original order; only the rejected ones vanish.
        kept = []
        for chunk in document.split("\n\n"):
            if compute_spam_word_ratio(chunk) < SPAM_WORD_RATIO_CUTOFF:
                kept.append(chunk)
        return "\n\n".join(kept)

    cleaned = [None if entry is None else _strip_spam(entry) for entry in texts]
    return cleaned, images, metadata