def generate_json_for_csv_line_offset()

in mico/dataloader/query_doc_pair.py [0:0]


def generate_json_for_csv_line_offset(csv_filepath, save_per_line=1):
    """Build (or load from cache) a table of byte offsets into a CSV file.

    The table lets a caller `seek` close to an arbitrary line instead of
    scanning the file from the start. It is cached on disk as JSON and
    returned in memory as a `np.array`.

    On disk, the cache is stored next to the CSV file and named
    `<csv_filepath>.offset_per<save_per_line>.json`; e.g. `example.csv` with
    `save_per_line=1` gives `example.csv.offset_per1.json`. The first number
    in the JSON list is the total line count of the CSV; the remaining numbers
    are the offsets. If the cache file already exists it is loaded as-is and
    the CSV is not re-scanned.

    Parameters
    ----------
    csv_filepath : string
        The path to the CSV file which we are calculating the offset on.
    save_per_line : int
        Keep one offset every `save_per_line` lines. Values larger than 1
        make reads a little slower (the reader has to skip forward from the
        nearest saved offset) but shrink the offset table accordingly.
        Must be at least 1.

    Returns
    -------
    line_count : int
        The total number of lines in the CSV, which is also the sample size
        of this file.
    offset_data : numpy.array
        Byte offsets measured on the file opened in binary mode.
        `offset_data[0]` is always 0 and `offset_data[k]` is the offset just
        past line `k * save_per_line`, i.e. where the next line would start.
        When the line count is a multiple of `save_per_line`, the last entry
        is therefore the end-of-file offset.

    Raises
    ------
    ValueError
        If `csv_filepath` is not an existing file, or `save_per_line` is
        smaller than 1.
    """
    if not os.path.isfile(csv_filepath):
        raise ValueError("CSV File %s does not exist" % csv_filepath)
    # Reject this up front: 0 would otherwise fail with ZeroDivisionError in
    # the middle of the scan, and negative values have no sensible meaning.
    if save_per_line < 1:
        raise ValueError("save_per_line must be at least 1, got %r" % (save_per_line,))

    offset_json_filepath = csv_filepath + '.offset_per' + str(save_per_line) + '.json'
    if os.path.isfile(offset_json_filepath):
        # Offset file already exists, just read it.
        with open(offset_json_filepath, 'r') as f:
            cached = json.load(f)
        return cached[0], np.array(cached[1:])

    line_count = 0
    offset_loc = 0
    offsets = [offset_loc]
    # Binary mode so that len(line) is the exact number of bytes on disk,
    # independent of text encoding and newline translation.
    with open(csv_filepath, 'rb') as csv_file:
        for line in csv_file:
            line_count += 1
            offset_loc += len(line)
            if line_count % save_per_line == 0:
                offsets.append(offset_loc)

    # Write to a temporary sibling and move it into place, so an interrupted
    # run cannot leave a truncated cache file that breaks every later load.
    tmp_filepath = '%s.tmp.%d' % (offset_json_filepath, os.getpid())
    try:
        with open(tmp_filepath, 'w') as f:
            json.dump([line_count] + offsets, f, indent='\t')
        os.replace(tmp_filepath, offset_json_filepath)
    finally:
        # Only still present if the dump or the move failed.
        if os.path.exists(tmp_filepath):
            os.remove(tmp_filepath)
    return line_count, np.array(offsets)