def calc_cooccurrences()

in data_measurements/npmi/npmi.py [0:0]


    def calc_cooccurrences(self, subgroup, subgroup_idx):
        """Count how often each vocabulary word co-occurs with a subgroup term.

        Each batch in ``self.word_cnts_per_sentence`` is loaded into a
        DataFrame (rows are sentences, columns are vocabulary entries).  The
        rows whose subgroup count is positive are kept, and their transpose is
        multiplied with the subgroup's own per-sentence counts, giving one
        co-occurrence value per vocabulary entry.  The per-batch results are
        summed element-wise across all batches.

        Args:
            subgroup: The subgroup term being measured.  It does not take part
                in the computation; it is kept so callers need not change.
            subgroup_idx: Column label of the subgroup inside each batch
                DataFrame (presumably the term's integer position in the
                vocabulary -- confirm against the caller).

        Returns:
            A DataFrame indexed by ``self.vocab_counts_df.index`` with a
            single integer column named ``"count"``.  When there are no
            batches at all, every count is zero.
        """
        # NOTE(review): batches are fetched by position rather than by
        # iterating the container, because the container's type is not visible
        # from this method and iteration may not be equivalent to ``[i]``.
        num_batches = len(self.word_cnts_per_sentence)
        coo_df = None
        # Big computation here!  Should only happen once.
        logs.debug(
            "Approaching big computation! Here, we binarize all words in the "
            "sentences, making a sparse matrix of sentences."
        )
        for batch_id in range(num_batches):
            # Every 100 batches, print out the progress.
            if batch_id % 100 == 0:
                logs.debug(
                    "%s of %s co-occurrence count batches"
                    % (str(batch_id), str(num_batches))
                )
            # Dataframe of # sentences in batch x vocabulary size
            sent_batch_df = pd.DataFrame(self.word_cnts_per_sentence[batch_id])
            # Subgroup counts per-sentence for the given batch
            subgroup_counts = sent_batch_df[subgroup_idx]
            # Sentences without the subgroup contribute nothing to the
            # product, so drop them up front to save computation and memory.
            has_subgroup = subgroup_counts > 0
            # Co-occurrence of the subgroup with every word in this batch:
            # (vocab x kept sentences) . (kept sentences) -> one value per word.
            batch_coo_df = pd.DataFrame(
                sent_batch_df[has_subgroup].T.dot(subgroup_counts[has_subgroup])
            )
            # Accumulate the running total across batches.
            if coo_df is None:
                coo_df = batch_coo_df
            else:
                coo_df = coo_df.add(batch_coo_df, fill_value=0)
        if coo_df is None:
            # No batches were available: the subgroup co-occurs with nothing,
            # so report zero for every vocabulary entry instead of failing on
            # ``None`` below.
            logs.debug("No co-occurrence count batches; using zero counts")
            coo_df = pd.DataFrame(
                0, index=self.vocab_counts_df.index, columns=["count"]
            )
        logs.debug("Made co-occurrence matrix")
        logs.debug(coo_df)
        # Re-label the positional rows with the vocabulary index.
        count_df = coo_df.set_index(self.vocab_counts_df.index)
        count_df.columns = ["count"]
        count_df["count"] = count_df["count"].astype(int)
        return count_df