cohere_vector/_tools/parse_queries.py (13 lines of code) (raw):
import json
from datasets import load_dataset
# Dataset identifier passed as the first argument to `load_dataset`.
# Plain string literal: there is nothing to interpolate, so no f-prefix.
DATASET_NAME: str = "Cohere/miracl-en-queries-22-12"
# Split of the dataset requested from `load_dataset`.
DATASET_SPLIT: str = "train"
# Path of the file opened for writing when the module is run as a script.
OUTPUT_FILENAME: str = "queries.json"
def output_queries(queries_file):
    """Write the ``emb`` field of every dataset record to *queries_file*.

    The dataset is obtained with ``load_dataset(DATASET_NAME,
    split=DATASET_SPLIT)``. For each record, the value stored under the
    ``"emb"`` key is serialized with ``json.dumps`` and written, followed by
    a newline, so the output holds one JSON value per line.

    Args:
        queries_file: Writable text object; only its ``write`` method is used.
    """
    dataset = load_dataset(DATASET_NAME, split=DATASET_SPLIT)
    # NOTE(review): "emb" is presumably the query embedding vector -- confirm
    # against the dataset card.
    embeddings = (record["emb"] for record in dataset)
    for embedding in embeddings:
        serialized = json.dumps(embedding)
        queries_file.write(serialized)
        queries_file.write("\n")
if __name__ == "__main__":
    # Script entry point: create/truncate OUTPUT_FILENAME and fill it via
    # output_queries. The encoding is given explicitly so the result does not
    # depend on the platform's locale default.
    with open(OUTPUT_FILENAME, "w", encoding="utf-8") as queries_file:
        output_queries(queries_file)