def sent_tokenize()

in distant_supervision/text_preprocessor.py [0:0]

11 lines of code
7 McCabe index (conditional complexity)


    def sent_tokenize(self, raw_text, title):
        """
        Split raw text into a flat list of sentence strings.

        The text is first broken into segments on runs of newline / carriage
        return characters. Each segment is then sentence-segmented on its own,
        so no returned sentence spans a line break.

        :param raw_text: the text to segment
        :param title: optional title; when truthy and equal to the first
            segment of ``raw_text``, that segment is dropped from the output
        :return: a list of sentences (str), each stripped of surrounding
            whitespace, in the order they appear in ``raw_text``
        """
        # There are different types of sentence segmentation. See
        # https://spacy.io/usage/linguistic-features#sbd for more details
        # The sentencizer is much faster, but not as good as DependencyParser
        # Alternatively, nlp = SpacyMagic.load('en_core_web_sm')  # using DependencyParser
        nlp = SpacyMagic.load_en_sentencizer()

        # re.split always returns at least one element, so indexing [0] is safe
        # even for an empty ``raw_text``.
        segments = re.split(r'[\n\r]+', raw_text)
        if title and segments[0] == title:
            # Remove the first segment if it is the same as the title.
            segments = segments[1:]

        sentences_agg = []
        for segment in segments:
            doc = nlp(segment)
            # Use ``Span.text`` rather than the deprecated ``Span.string``
            # alias, which spaCy 3 removed; after ``strip()`` the result is
            # the same on spaCy 2.
            sentences_agg.extend(sent.text.strip() for sent in doc.sents)
        return sentences_agg