def count_words_per_sentence()

in data_measurements/npmi/npmi.py [0:0]


    def count_words_per_sentence(self):
        """Binarize per-sentence vocabulary occurrence, one matrix per batch.

        ``self.tokenized_sentence_df`` is cut into consecutive row slices whose
        boundaries are ``NUM_BATCHES`` evenly spaced integer positions between
        0 and the number of rows. Each slice is passed through a
        ``MultiLabelBinarizer`` restricted to ``self.vocabulary``.

        Note that the binarizer records presence/absence (0 or 1) of each
        vocabulary item in a sentence, not how many times it occurs.

        Returns:
            list: One binarizer output per batch, in row order. There is one
            entry per pair of adjacent boundaries, i.e. ``NUM_BATCHES - 1``
            entries when ``NUM_BATCHES >= 1``. Each entry has shape
            (# sentences in the batch) x (# vocabulary items).
        """
        logs.info("Creating co-occurrence matrix for nPMI calculations.")
        # NOTE(review): this logs the repr of the tokenized sentences at INFO
        # level; kept as-is to preserve existing log output.
        logs.info(self.tokenized_sentence_df)
        # NUM_BATCHES boundary points delimit NUM_BATCHES - 1 batches.
        batches = np.linspace(0, self.tokenized_sentence_df.shape[0],
                              NUM_BATCHES).astype(int)
        num_batches = max(len(batches) - 1, 0)
        # The binarizer's configuration does not depend on the batch, so it is
        # built once; fit_transform re-fits it against the fixed class list on
        # every call, so reusing the instance does not carry state across
        # batches.
        mlb = MultiLabelBinarizer(classes=self.vocabulary)
        word_cnts_per_sentence = []
        # Walk adjacent boundary pairs: (b0, b1), (b1, b2), ...
        for batch_num, (start, stop) in enumerate(
                zip(batches[:-1], batches[1:])):
            if batch_num % 100 == 0:
                logs.debug(
                    "%s of %s sentence binarize batches." % (
                        batch_num, num_batches)
                )
            # Rows [start, stop) of the tokenized sentences.
            sentence_batch = self.tokenized_sentence_df[start:stop]
            # Matrix of shape batch size (# sentences) x # words, marking
            # which vocabulary items occur in each sentence.
            word_cnts_per_sentence.append(mlb.fit_transform(sentence_batch))
        return word_cnts_per_sentence