# mico/dataloader/query_doc_pair.py
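# Module-level imports assumed by this excerpt (not shown here; the helper's import
# path is hypothetical):
#   import logging
#   import numpy as np
#   from mico.dataloader.utils import generate_json_for_csv_line_offset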
def __init__(self, filename, val_ratio=0, val_indices=None, is_csv_header=True, is_get_all_info=False):
"""We generate or load the offset data of the CSV file (if it exists).
If we are generating training or validation data, we use blocks to split them and create a mapping
for reading the raw CSV rows as training or validation samples.
Parameters
----------
filename : string
The path to the CSV file.
It should be in the format [query, ID, doc, click, purchase]
val_ratio : float
How much of the training data will be in the validation dataset.
The rest will be put in training dataset.
If this is set, we know that we are generating the training dataset instead of validation or testing.
val_indices : numpy.array
Which sample in the data is used as validation data.
If this is set, we know that we are generating the validation dataset instead of training or testing.
is_csv_header : bool
When reading CSV files as input, set `True` if they have headers, so we can skip the first line.
is_get_all_info : bool
Set this to be True if we are running evaluation of our model on test dataset to check its performance.
"""
    self._filename = filename
    self.is_get_all_info = is_get_all_info
    self.offset_save_per_line = 10  # a trade-off between loading speed and memory usage
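    # Assumption (the helper is defined elsewhere): generate_json_for_csv_line_offset
    # scans the file once and returns the total line count plus a byte-offset index
    # recorded every `save_per_line` lines, so a row can be reached with one seek and
    # a few sequential line reads.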
    self._total_size, self.offset_data = generate_json_for_csv_line_offset(
        filename, save_per_line=self.offset_save_per_line)
    self._is_csv_header = int(is_csv_header)
    with open(self._filename, 'r') as csv_file:
        first_line = csv_file.readline().strip()
    expected_header = 'query, ID, doc, click, purchase'
    if first_line == expected_header and not is_csv_header:
        logging.error('The first line of the file "%s" is "%s".', filename, first_line)
        logging.error('It seems this CSV file has a header. Please set --is_csv_header.')
        raise ValueError('CSV file appears to have a header; set --is_csv_header.')
    if first_line != expected_header and is_csv_header:
        logging.error('The first line of the file "%s" is "%s".', filename, first_line)
        logging.error('It seems this CSV file has no header. Please do not set --is_csv_header.')
        raise ValueError('CSV file appears to have no header; do not set --is_csv_header.')
    self._total_size -= self._is_csv_header  # the CSV header line is not a sample
    self.csv_reader_setting = {'delimiter': ',', 'quotechar': '"', 'doublequote': False,
                               'escapechar': '\\', 'skipinitialspace': True}
    self.train_val_subblock_size = 10
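    # The train/validation split is done over contiguous sub-blocks of 10 rows rather
    # than individual rows; presumably this keeps reads within a block sequential,
    # which pairs well with the sparse byte-offset index built above.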
    if val_ratio != 0:  # train dataset
        val_size = int(self._total_size * val_ratio)
        np.random.seed(0)
        total_block_num = self._total_size // self.train_val_subblock_size
        val_block_num = val_size // self.train_val_subblock_size
        remaining_num = self._total_size % self.train_val_subblock_size
        self.val_indices = np.random.choice(range(total_block_num),
                                            size=val_block_num, replace=False)
        val_indices_set = set(self.val_indices)
        self.idx_mapping = []
        # Iterate over all full sub-blocks plus the final partial block, if any;
        # the partial block is never drawn for validation, so it always stays in training.
        for index_on_train_file in range(total_block_num + int(remaining_num != 0)):
            if index_on_train_file not in val_indices_set:
                self.idx_mapping.append(index_on_train_file)
        self.idx_mapping = np.array(self.idx_mapping)
        self._total_size = (len(self.idx_mapping) - int(remaining_num != 0)) * self.train_val_subblock_size + remaining_num
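        # Worked example: with 25 rows and sub-blocks of 10, there are 2 full blocks plus
        # a 5-row remainder block (index 2). If block 1 is drawn for validation,
        # idx_mapping becomes [0, 2] and the training size is (2 - 1) * 10 + 5 = 15 rows.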
        self.is_test_data = False
    elif val_indices is not None:  # val dataset
        self.idx_mapping = val_indices
        self._total_size = len(self.idx_mapping) * self.train_val_subblock_size
        self.is_test_data = False
    else:  # test dataset
        self.is_test_data = True
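    # A minimal usage sketch (the enclosing class is not shown in this excerpt;
    # `QueryDocPairDataset` is a hypothetical stand-in for its name):
    #
    #   train_set = QueryDocPairDataset('train.csv', val_ratio=0.1)
    #   val_set = QueryDocPairDataset('train.csv', val_indices=train_set.val_indices)
    #   test_set = QueryDocPairDataset('test.csv', is_get_all_info=True)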