in distant_supervision/text_preprocessor.py [0:0]
def sent_tokenize(self, raw_text, title):
    """
    Split raw text into a flat list of sentence strings.

    The text is first broken into segments on runs of newline /
    carriage-return characters. Each segment is run through the sentence
    segmenter on its own, so no returned sentence spans a line break.

    :param raw_text: the text to segment
    :param title: optional title; when truthy and exactly equal to the first
        segment of ``raw_text``, that segment is dropped from the output
    :return: a list of sentence strings, in document order, each with
        leading/trailing whitespace stripped
    """
    # There are different types of sentence segmentation. See
    # https://spacy.io/usage/linguistic-features#sbd for more details
    # The sentencizer is much faster, but not as good as DependencyParser
    # Alternatively, nlp = SpacyMagic.load('en_core_web_sm') # using DependencyParser
    nlp = SpacyMagic.load_en_sentencizer()

    # re.split always returns at least one element, so indexing [0] is safe.
    segments = re.split(r'[\n\r]+', raw_text)
    if title and segments[0] == title:
        # remove the first element if it is the same as the title
        segments = segments[1:]

    sentences_agg = []
    for segment in segments:
        if not segment:
            # Leading/trailing line breaks produce empty segments; there is
            # nothing to segment, so skip the pipeline call.
            continue
        doc = nlp(segment)
        # Use `.text` rather than the deprecated `.string` attribute, which
        # newer spaCy releases no longer provide. After strip() the result
        # is the same.
        sentences_agg.extend(sent.text.strip() for sent in doc.sents)
    return sentences_agg