def _process_row()

in distant_supervision/input_parser.py [0:0]
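Parses a single raw Wikipedia row into an `Article`: the row is first filtered by `PrepWikipedia.preprocess`, its `text` and `title` fields are unicode-normalized, the text is split into sentences, and each sentence is kept only if NER or noun-chunk extraction finds something in it.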


    def _process_row(self, row):
        """Convert one raw dump row into an Article, or return None to drop it."""
        raw_row = row

        # preprocess acts as a filter here: its result is only checked for
        # truthiness, and rows it rejects are dropped from the pipeline.
        preprocessed_row = PrepWikipedia.preprocess(raw_row)
        if not preprocessed_row:
            return None

        article = Article()

        # Unicode-normalize the fields used downstream, in place.
        for k in ['text', 'title']:
            raw_row[k] = self.text_preprocessor.unicode_normalize(raw_row[k])

        article.import_from(raw_row)

        # article.tok = ' '.join(self.text_preprocessor.word_tokenize(article.text))

        # Split the article text into sentence strings.
        sentence_str_lst = self.text_preprocessor.sent_tokenize(article.text, article.title)

        sentence_structs = []
        for sent_str in sentence_str_lst:
            ents, noun_chunks = self.text_preprocessor.compute_ner_and_noun_chunks(sent_str)

            if len(ents) + len(noun_chunks) == 0:
                # Discard sentences that have neither named entities
                # nor noun chunks.
                continue

            # Each kept sentence gets a random 16-character id alongside
            # its text and annotations.
            sentence_structs.append(dict(
                id=utils.random_str(16),
                text=sent_str,
                noun_chunks=noun_chunks,
                ents=ents))

        article.sents = sentence_structs

        return article
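
For orientation, a minimal usage sketch of how this method might be driven. The class name `InputParser`, its no-argument construction, and the concrete row values are assumptions for illustration (only the accessed fields `text` and `title` are confirmed by the code above); the shape of `article.sents` follows from the dicts built in the loop.

    # Hypothetical driver; InputParser construction and the row schema
    # are assumptions based on the fields accessed in _process_row.
    parser = InputParser()

    row = {
        'title': 'Ada Lovelace',
        'text': ('Ada Lovelace was an English mathematician. '
                 'She worked with Charles Babbage.'),
    }

    article = parser._process_row(row)
    if article is not None:
        # article.sents is a list of dicts, each carrying a random
        # 16-char id, the sentence text, and its noun_chunks / ents.
        for sent in article.sents:
            print(sent['id'], sent['text'])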