openai_vector/_tools/parse_queries.py (23 lines of code) (raw):
#!/usr/bin/env python3
import bz2
import json
import sys
import typing
import pyarrow as pa
BATCH_SIZE: int = 1000
QUERY_COLUMN: str = "embedding"
OUTPUT_FILENAME: str = "queries.json.bz2"
def output_queries(input_filename: str, queries_file: typing.TextIO):
with pa.memory_map(input_filename, "rb") as source:
query_table = pa.ipc.open_stream(source).read_all()
for record_batch in query_table.to_batches(max_chunksize=BATCH_SIZE):
query_list = record_batch.column(QUERY_COLUMN)
for query in query_list:
queries_file.write(json.dumps(query.as_py()))
queries_file.write("\n")
if __name__ == "__main__":
if len(sys.argv) != 2:
print("Usage: {} <input_file_path>".format(sys.argv[0]))
exit(1)
input_filename = sys.argv[1]
with bz2.open(OUTPUT_FILENAME, "wt") as queries_file:
output_queries(input_filename, queries_file)