dense_vector/_tools/parse.py (22 lines of code) (raw):

#!/usr/bin/env python3
"""Convert a binary vectors file into newline-delimited JSON on stdout.

The input is read as two 4-byte header fields (total number of vectors, then
the vector dimension) followed by consecutive records of ``dims`` 4-byte
floats. Each record is printed as one ``{"vector": [...]}`` JSON document.
"""

import json
import struct
import sys

try:
    from tqdm import tqdm
except ModuleNotFoundError:
    print("Warning: [tqdm] package is not available and you won't be able to see progress.", file=sys.stderr)
    # Without tqdm, iterate silently.
    iterate = range
else:
    def iterate(i):
        """Return an iterator over ``range(i)`` that reports progress via tqdm."""
        return tqdm(range(i))

# Defaults used by to_json() when no explicit override is passed.
dims = 96
num_vectors = 10000000


def to_json(f, *, vector_dims=None, max_vectors=None):
    """Print up to ``max_vectors`` vectors from *f* as JSON lines on stdout.

    Args:
        f: Binary file-like object positioned at the start of the file.
        vector_dims: Number of floats per vector. Defaults to the module-level
            ``dims`` (looked up at call time).
        max_vectors: Maximum number of vectors to emit. Defaults to the
            module-level ``num_vectors`` (looked up at call time).

    Raises:
        struct.error: If the input ends in the middle of a vector record.

    If the input ends cleanly on a record boundary before ``max_vectors``
    records were read, a warning is written to stderr and the function
    returns instead of failing.
    """
    vector_dims = dims if vector_dims is None else vector_dims
    max_vectors = num_vectors if max_vectors is None else max_vectors

    # The header values are skipped, not interpreted; the record layout comes
    # from vector_dims / max_vectors instead.
    f.read(4)  # the total number of vectors
    f.read(4)  # the vector dimension

    # Compile the record layout once rather than rebuilding the format string
    # per vector. "f" uses the platform's native byte order.
    # NOTE(review): the file is presumably little-endian - confirm before
    # running on a big-endian host.
    record = struct.Struct("f" * vector_dims)

    for index in iterate(max_vectors):
        chunk = f.read(record.size)
        if not chunk and record.size:
            # Clean end of input on a record boundary: stop instead of
            # letting struct raise on an empty buffer.
            print(
                f"Warning: input ended after {index} vectors (expected {max_vectors}).",
                file=sys.stderr,
            )
            break
        # A short (non-empty) chunk means a truncated record; unpack() raises
        # struct.error for it, which is the right outcome for corrupt input.
        vector = record.unpack(chunk)
        print(json.dumps({"vector": vector}, ensure_ascii=False))


def main(argv=None):
    """Run the command-line tool and return the process exit status.

    Args:
        argv: Argument list including the program name; defaults to
            ``sys.argv``.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) != 2:
        # stdout carries the JSON stream, so diagnostics go to stderr.
        print(
            f"Error: No vectors file. Rerun using [{argv[0]} /path/to/vectors.fbin].",
            file=sys.stderr,
        )
        return 1

    with open(argv[1], "rb") as f:
        to_json(f)
    return 0


if __name__ == "__main__":
    sys.exit(main())