in mico/dataloader/query_doc_pair.py [0:0]
def __getitem__(self, idx):
"""We fetch the sample (from train/val/test dataset) from the CSV file.
If it is training or validation dataset, we use block-wise mapping to find the line number.
Parameters
----------
idx : int
Which sample we are fetching.
Returns
-------
parsed_list : list
If is_get_all_info is True, return a list of string: [query, ID, document, click, purchase]
If is_get_all_info is False (by default), return a list of strings: [query, document]
"""
if not self.is_test_data:
block_idx = idx // self.train_val_subblock_size
inbloack_idx = idx % self.train_val_subblock_size
block_idx = self.idx_mapping[block_idx]
idx = block_idx * self.train_val_subblock_size + inbloack_idx
idx += self._is_csv_header # the CSV header is not a sample
offset_idx = idx // self.offset_save_per_line
offset = self.offset_data[offset_idx]
try:
with open(self._filename, 'r') as csv_file:
csv_file.seek(offset)
for _ in range(1 + idx % self.offset_save_per_line):
line = csv_file.readline()
line = line.replace('\0','')
csv_line = csv.reader([line], **self.csv_reader_setting)
parsed_list = next(csv_line) # [query, ID, document, click, purchase]
if self.is_get_all_info:
return parsed_list
else:
return [parsed_list[0], parsed_list[2]]
except: # This is for quick inspection about which part goes wrong.
error_message = "Something wrong when reading CSV samples.\n Details (filename, line index): {}, \t {}".format(self._filename, idx)
raise IOError(error_message)