def __init__()

in mico/dataloader/query_doc_pair.py [0:0]


    def __init__(self, filename, val_ratio=0, val_indices=None, is_csv_header=True, is_get_all_info=False):
        """We generate or load the offset data of the CSV file (if it exists).
        If we are generating training or validation data, we use blocks to split them and create a mapping
            for reading the raw CSV rows as training or validation samples.

        Parameters
        ----------
        filename : string
            The path to the CSV file.
            It should be in the format [query, ID, doc, click, purchase]
        val_ratio : float
            How much of the training data will be in the validation dataset.
            The rest will be put in training dataset.
            If this is set, we know that we are generating the training dataset instead of validation or testing.
        val_indices : numpy.array
            Which sample in the data is used as validation data.
            If this is set, we know that we are generating the validation dataset instead of training or testing.
        is_csv_header : bool
            When reading CSV files as input, set `True` if they have headers, so we can skip the first line.
        is_get_all_info : bool
            Set this to be True if we are running evaluation of our model on test dataset to check its performance.

        Raises
        ------
        ValueError
            If `is_csv_header` disagrees with whether the file's first line
            matches the expected header row.
        """
        self._filename = filename
        self.is_get_all_info = is_get_all_info
        # Storing an offset only every N lines is a trade-off between the
        # loading speed and the memory usage of the offset index.
        self.offset_save_per_line = 10
        self._total_size, self.offset_data = generate_json_for_csv_line_offset(filename, save_per_line=self.offset_save_per_line)
        self._is_csv_header = int(is_csv_header)

        # Sanity-check that the caller's is_csv_header flag agrees with the
        # actual first line of the file; a mismatch would silently shift every
        # sample by one row, so fail fast instead.
        with open(self._filename, 'r') as csv_file:
            line = csv_file.readline().strip()
            expected_header = 'query, ID, doc, click, purchase'
            if line == expected_header and not is_csv_header:
                logging.error('The first line of the file "%s" \t is \t "%s".' % (filename, line))
                raise ValueError('It seems like this CSV file has header. Please set --is_csv_header')
            if line != expected_header and is_csv_header:
                logging.error('The first line of the file "%s" \t is \t "%s".' % (filename, line))
                raise ValueError('It seems like this CSV file does not have header. Please do not set --is_csv_header')

        self._total_size -= self._is_csv_header # the CSV header is not a sample
        self.csv_reader_setting = {'delimiter':",", 'quotechar':'"', 'doublequote':False, 'escapechar':'\\', 'skipinitialspace':True}
        # Train/val split is done on contiguous sub-blocks of rows rather than
        # individual rows, so offset lookups stay cheap.
        self.train_val_subblock_size = 10
        if val_ratio != 0: # train dataset
            val_size = int(self._total_size * val_ratio)
            # Fixed seed so the train/val split is reproducible across runs
            # (the val dataset re-derives the same block choice).
            np.random.seed(0)
            total_block_num = self._total_size // self.train_val_subblock_size
            val_block_num = val_size // self.train_val_subblock_size
            remaining_num = self._total_size % self.train_val_subblock_size
            self.val_indices = np.random.choice(range(total_block_num), \
                                                size=val_block_num, replace=False)
            val_indices_set = set(self.val_indices)
            # Map each training-sample block index to its block index in the
            # raw file, skipping the blocks reserved for validation. The last
            # (possibly partial) block always stays in the training split.
            self.idx_mapping = []
            for index_on_train_file in range(total_block_num + int(remaining_num != 0)):
                if index_on_train_file not in val_indices_set:
                    self.idx_mapping.append(index_on_train_file)
            self.idx_mapping = np.array(self.idx_mapping)
            # Full blocks contribute train_val_subblock_size samples each; the
            # trailing partial block (if any) contributes remaining_num.
            self._total_size = (len(self.idx_mapping) - int(remaining_num != 0)) * self.train_val_subblock_size + remaining_num
            self.is_test_data = False
        elif val_indices is not None: # val dataset
            # Validation reuses the block indices chosen by the train split;
            # NOTE(review): this assumes every validation block is full-sized.
            self.idx_mapping = val_indices
            self._total_size = len(self.idx_mapping) * self.train_val_subblock_size
            self.is_test_data = False
        else: # test dataset
            self.is_test_data = True