in fulltext_search/index_docs.py [0:0]
def insert_batch(batch):
    """Post one batch of documents to the local ``/bulk`` HTTP endpoint.

    Each document becomes one ``insert`` action serialized as a single JSON
    line (NDJSON). All documents of a call go to the same index, whose name
    is ``fineweb`` followed by a random integer in ``[0, 63]``.

    Args:
        batch: Mapping with the column keys ``"text"``, ``"id"``, ``"url"``,
            ``"language_score"`` and ``"token_count"``, each an iterable of
            per-document values. Columns are combined with ``zip``, so
            iteration stops at the shortest column.

    Returns:
        A dict ``{"response": [status_code]}`` holding the HTTP status code
        of the bulk request. The status code is not inspected here, so
        non-success codes are returned to the caller as-is.

    Raises:
        Only ``requests.exceptions.ConnectionError`` is retried; any other
        exception raised while posting propagates to the caller.
    """
    # Drawn once per call: every document in this batch shares the index.
    index_name = f"fineweb{random.randint(0, 63)}"

    lines = []
    for text, _id, url, language_score, token_count in zip(
        batch["text"],
        batch["id"],
        batch["url"],
        batch["language_score"],
        batch["token_count"],
    ):
        # Keep the part after the last ":" and drop any ">" at either end.
        # NOTE(review): presumably ids look like "<urn:uuid:...>" - confirm.
        fw_id = _id.split(":")[-1].strip(">")
        action = {
            "insert": {
                "index": index_name,
                "_id": fw_id,
                "doc": {
                    "content": text,
                    "fw_id": fw_id,
                    "url": url,
                    "language_score": language_score,
                    "token_count": token_count,
                },
            }
        }
        lines.append(json.dumps(action) + "\n")

    # Join once instead of growing a string with += inside the loop.
    ndjson = "".join(lines)

    # Retry indefinitely while the endpoint is unreachable, pausing one
    # second between attempts; the first request that yields a response
    # (whatever its status) ends the loop.
    while True:
        try:
            response = requests.post(
                "http://127.0.0.1:9308/bulk",
                headers={"Content-Type": "application/x-ndjson"},
                data=ndjson,
            )
        except requests.exceptions.ConnectionError as e:
            print(e, file=sys.stderr)
            time.sleep(1)
        else:
            return {"response": [response.status_code]}