in problem.py [0:0]
def build(self, data_path_list, file_columns, input_types, file_with_col_header, answer_column_name, word2vec_path=None, word_emb_dim=None,
        format=None, file_type=None, involve_all_words=None, file_format="tsv", show_progress=True,
        cpu_num_workers=-1, max_vocabulary=800000, word_frequency=3, max_building_lines=1000*1000):
    """Build the input/output vocabularies and, optionally, a pretrained word embedding matrix.

    Args:
        data_path_list (list): paths of corpus files to scan; falsy entries are skipped.
        file_columns (dict): column name -> column index in the data file, e.g.
            {
                "word1": 0,
                "word2": 1,
                "label": 2,
                "postag_feature1": 3,
                "postag_feature2": 4
            }
        input_types (dict): input type -> configuration, e.g.
            {
                "word": {"cols": ["word1", "word2"], "dim": 300},
                "postag": {"cols": ["postag_feature1", "postag_feature2"], "dim": 20}
            }
            or, for BPE inputs,
            {
                "bpe": {"cols": ["word1", "word2"], "dim": 100, "bpe_path": "xxx.bpe"}
            }
        file_with_col_header (bool): True if the data files carry a header line.
        answer_column_name: name(s) of the target column(s).
        word2vec_path (str): path to a pretrained embedding file; None disables embedding loading.
        word_emb_dim (int): expected dimension of the pretrained embedding vectors.
        format: format of the pretrained embedding file (forwarded to load_embedding).
        file_type: type of the pretrained embedding file (forwarded to load_embedding).
        involve_all_words (bool): if True, involve all words that show up in the pretrained
            embedding, not only the words seen in the corpus.
        file_format (str): "tsv" or "json". Note "json" means each sample is represented
            by a json string.
        show_progress (bool): kept for interface compatibility; unused here.
        cpu_num_workers (int): worker count for preprocessing; -1 means use all available CPUs.
        max_vocabulary (int): upper bound on input dictionary size.
        word_frequency (int): minimum frequency for a token to enter an input dictionary.
        max_building_lines (int): at most this many lines per file are used for building.

    Returns:
        numpy.ndarray or None: embedding matrix whose rows align with the 'word'
        dictionary ids, or None when word2vec_path is not provided.
    """
    # parameter check
    bpe_encoder = self._check_bpe_encoder(input_types)
    self.file_column_num = len(file_columns)

    # ---- scan the corpus files and update the symbol universe ----
    for data_path in data_path_list:
        if not data_path:
            continue
        progress = self.get_data_generator_from_file(data_path, file_with_col_header, chunk_size=max_building_lines)
        preprocessed_data_generator = self.build_training_multi_processor(progress, cpu_num_workers, file_columns, input_types, answer_column_name, bpe_encoder=bpe_encoder)
        # only the first chunk (at most max_building_lines lines) is consumed for building
        docs, target_docs, cnt_legal, cnt_illegal = next(preprocessed_data_generator)
        # input_type
        for input_type in input_types:
            self.input_dicts[input_type].update(docs[input_type])
        # problem_type: classification / sequence tagging need a target dictionary;
        # regression / mrc have no discrete label vocabulary
        if ProblemTypes[self.problem_type] == ProblemTypes.classification or \
                ProblemTypes[self.problem_type] == ProblemTypes.sequence_tagging:
            self.output_dict.update(list(target_docs.values())[0])
        elif ProblemTypes[self.problem_type] == ProblemTypes.regression or \
                ProblemTypes[self.problem_type] == ProblemTypes.mrc:
            pass
        logging.info("[Building Dictionary] in %s at most %d lines imported: %d legal lines, %d illegal lines." % (data_path, max_building_lines, cnt_legal, cnt_illegal))

    # ---- freeze the dictionaries ----
    for input_type in input_types:
        self.input_dicts[input_type].build(threshold=word_frequency, max_vocabulary_num=max_vocabulary)
        logging.info("%d types in %s column" % (self.input_dicts[input_type].cell_num(), input_type))
    if self.output_dict:
        self.output_dict.build(threshold=0)
        if ProblemTypes[self.problem_type] == ProblemTypes.sequence_tagging:
            # sequence tagging needs explicit boundary tags appended after the regular labels
            self.output_dict.cell_id_map["<start>"] = len(self.output_dict.cell_id_map)
            self.output_dict.id_cell_map[len(self.output_dict.id_cell_map)] = "<start>"
            self.output_dict.cell_id_map["<eos>"] = len(self.output_dict.cell_id_map)
            self.output_dict.id_cell_map[len(self.output_dict.id_cell_map)] = "<eos>"
        logging.info("%d types in target column" % (self.output_dict.cell_num()))
    logging.debug("training data dict built")

    # ---- embedding ----
    word_emb_matrix = None
    if word2vec_path:
        logging.info("Getting pre-trained embeddings...")
        if involve_all_words is True:
            # take the whole pretrained vocabulary, not only corpus words
            word_emb_dict = load_embedding(word2vec_path, word_emb_dim, format, file_type, with_head=False, word_set=None)
            self.input_dicts['word'].update([list(word_emb_dict.keys())])
            self.input_dicts['word'].build(threshold=0, max_vocabulary_num=len(word_emb_dict))
        else:
            # also request lowercased variants so cased corpus words can match uncased embeddings
            extend_vocabulary = set()
            for single_word in self.input_dicts['word'].cell_id_map:
                extend_vocabulary.add(single_word)
                if single_word.lower() != single_word:
                    extend_vocabulary.add(single_word.lower())
            word_emb_dict = load_embedding(word2vec_path, word_emb_dim, format, file_type, with_head=False, word_set=extend_vocabulary)

        # Fail fast with a clear message instead of the NameError the old
        # `for word in word_emb_dict: ...; break` probe raised on an empty dict.
        if not word_emb_dict:
            raise ValueError("No embeddings were loaded from %s!" % word2vec_path)
        loaded_emb_dim = len(next(iter(word_emb_dict.values())))
        assert loaded_emb_dim == word_emb_dim, "The dimension of defined word embedding is inconsistent with the pretrained embedding provided!"

        logging.info("constructing embedding table")
        if self.input_dicts['word'].with_unk:
            word_emb_dict['<unk>'] = np.random.random(size=word_emb_dim)
        if self.input_dicts['word'].with_pad:
            word_emb_dict['<pad>'] = np.random.random(size=word_emb_dim)

        word_emb_matrix = []
        unknown_word_count = 0
        # Xavier-style range for words that lack a pretrained vector
        scale = np.sqrt(3.0 / word_emb_dim)
        for i in range(self.input_dicts['word'].cell_num()):
            single_word = self.input_dicts['word'].id_cell_map[i]
            if single_word in word_emb_dict:
                word_emb_matrix.append(word_emb_dict[single_word])
            elif single_word.lower() in word_emb_dict:
                word_emb_matrix.append(word_emb_dict[single_word.lower()])
            else:
                word_emb_matrix.append(np.random.uniform(-scale, scale, word_emb_dim))
                unknown_word_count += 1
        word_emb_matrix = np.array(word_emb_matrix)
        logging.info("word embedding matrix shape:(%d, %d); unknown word count: %d;" %
                     (len(word_emb_matrix), len(word_emb_matrix[0]), unknown_word_count))
        logging.info("Word embedding loaded")

    return word_emb_matrix