in mico/dataloader/query_doc_pair.py [0:0]
def generate_json_for_csv_line_offset(csv_filepath, save_per_line=1):
    """Cache byte offsets of CSV lines so a random line can be found efficiently.

    The offset data is saved in `json` format on disk and returned as an
    `np.array` in memory. The first number in the `json` file is the total
    line count of the CSV; the remaining numbers are byte offsets. On disk,
    for a CSV file named `example.csv`, the offset file is saved in the same
    folder and named `example.csv.offset_per*.json`, where `*` is the
    `save_per_line` parameter.

    Parameters
    ----------
    csv_filepath : string
        The path to the CSV file which we are calculating the offset on.
    save_per_line : int
        If it is larger than 1, we only save the offset per `save_per_line`
        line. This will increase the reading time a little, but save a lot of
        memory usage for the offset. Must be >= 1.

    Returns
    -------
    offset_idx : int
        This is the total line number of the CSV which is also the sample
        size of this file.
    offset_data : numpy.array
        This is the offset data.

    Raises
    ------
    ValueError
        If `csv_filepath` does not exist, or `save_per_line` is less than 1.
    """
    if not os.path.isfile(csv_filepath):
        raise ValueError("CSV File %s does not exist" % csv_filepath)
    if save_per_line < 1:
        # Guard explicitly: a value of 0 would otherwise surface as an
        # opaque ZeroDivisionError at the modulo below.
        raise ValueError("save_per_line must be >= 1, got %d" % save_per_line)
    offset_json_filepath = csv_filepath + '.offset_per' + str(save_per_line) + '.json'
    if os.path.isfile(offset_json_filepath):
        # Offset file already exists, just read it.
        # NOTE(review): the cache is keyed only on the file name — it is not
        # invalidated if the CSV is rewritten, so a stale offset file would be
        # returned silently. Confirm callers delete it after changing the CSV.
        with open(offset_json_filepath, 'r') as f:
            offset_data = json.load(f)
        return offset_data[0], np.array(offset_data[1:])
    # Scan in binary mode so len(line) counts bytes (seek positions), not
    # decoded characters.
    offset_loc = 0  # byte offset of the line about to be read
    offset_idx = 0  # number of lines seen so far
    offset_data = [offset_loc]
    with open(csv_filepath, 'rb') as csv_file:
        for line in csv_file:
            offset_idx += 1
            offset_loc += len(line)
            if offset_idx % save_per_line == 0:
                offset_data.append(offset_loc)
    # Prepend the total line count so readers get it as element 0.
    offset_data = [offset_idx] + offset_data
    with open(offset_json_filepath, 'w+') as f:
        json.dump(offset_data, f, indent='\t')
    return offset_idx, np.array(offset_data[1:])