distant_supervision/input_parser.py
def _process_row(self, row):
    """Convert one raw dump row into an Article, or return None if the row is filtered out."""
    raw_row = row
    # Wikipedia-specific preprocessing; note the result is only used as a
    # validity filter here -- rows it rejects are dropped.
    preprocessed_row = PrepWikipedia.preprocess(raw_row)
    if not preprocessed_row:
        return None
    article = Article()
    # Unicode-normalize the fields that are tokenized downstream.
    for k in ['text', 'title']:
        raw_row[k] = self.text_preprocessor.unicode_normalize(raw_row[k])
    article.import_from(raw_row)
    # article.tok = ' '.join(self.text_preprocessor.word_tokenize(article.text))
    sentence_str_lst = self.text_preprocessor.sent_tokenize(article.text, article.title)
    sentence_structs = []
    for sent_str in sentence_str_lst:
        ents, noun_chunks = self.text_preprocessor.compute_ner_and_noun_chunks(sent_str)
        if len(ents) + len(noun_chunks) == 0:
            # Discard sentences with neither entities nor noun chunks:
            # they cannot yield candidate mentions.
            continue
        sentence_structs.append(dict(
            id=utils.random_str(16),
            text=sent_str,
            noun_chunks=noun_chunks,
            ents=ents))
    article.sents = sentence_structs
    return article
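
A minimal usage sketch of the method above. `InputParser` is assumed to be the enclosing class, and `load_rows` plus the dump filename are hypothetical stand-ins for whatever row source the repo actually uses; only the `_process_row` contract (row in, `Article` or `None` out) comes from the code itself.

# Sketch only: InputParser and load_rows are hypothetical names; the real
# driver code in this repo may differ.
parser = InputParser()
articles = []
for row in load_rows('enwiki_dump.jsonl'):  # hypothetical row source
    article = parser._process_row(row)
    if article is None:  # row rejected by PrepWikipedia.preprocess
        continue
    articles.append(article)
# Each kept article carries article.sents: a list of dicts with
# id / text / noun_chunks / ents for downstream distant supervision.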