def main()

in fulltext_search/index_docs.py [0:0]


def main():
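    """Index FineWeb dumps into 64 sharded full-text tables behind a single
    distributed table, via the search daemon's HTTP SQL endpoint (the SQL
    dialect and endpoints used here are Manticore Search's).

    Assumes the module-level imports (sys, time, requests,
    datasets.load_dataset) and the insert_batch helper defined elsewhere in
    index_docs.py.
    """
    # Raw SQL-over-HTTP endpoint of the local search daemon.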
    sql_url = "http://127.0.0.1:9308/sql?mode=raw"

    print("Removing table", file=sys.stderr)
    while True:
        try:
            requests.post(sql_url, data={"query": "drop table if exists fineweb"})
            break
        except requests.exceptions.ConnectionError as e:
            print(e, file=sys.stderr)
            time.sleep(5)
            pass

    print("Creating table", file=sys.stderr)
    for i in range(64):
        response = requests.post(
            sql_url, data={"query": f"drop table if exists fineweb{i}"}
        )
        print(response.text, file=sys.stderr)
        local_query = f"create table fineweb{i}(content text, fw_id string, url string, language_score float, token_count int) charset_table='non_cjk' stopwords='en' morphology='stem_en'"
        response = requests.post(sql_url, data={"query": local_query})
        print(response.text, file=sys.stderr)

    distributed_query = "create table fineweb type='distributed'"
    for i in range(64):
        distributed_query += f" local='fineweb{i}'"
    response = requests.post(sql_url, data={"query": distributed_query})
    print(response.text, file=sys.stderr)

    # Index two FineWeb CommonCrawl dumps.
    for dump in ["CC-MAIN-2024-10", "CC-MAIN-2023-50"]:
        print("Loading dataset", file=sys.stderr)
        dataset = load_dataset(
            "HuggingFaceFW/fineweb",
            dump,
            split="train",
            num_proc=64,
            cache_dir="/scratch/cosmo/.cache",
        )
        dataset = dataset.select_columns(
            ["text", "id", "url", "language_score", "token_count"]
        )
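        # insert_batch posts each 10,000-row batch to the shard tables; the
        # map is used only for its side effects, so every column is dropped
        # from the output.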
        dataset = dataset.map(
            insert_batch,
            batched=True,
            batch_size=10000,
            remove_columns=["text", "id", "url", "language_score", "token_count"],
            num_proc=64,
        )
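        # Iterate once over the mapped dataset to make sure every batch was processed.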
        for _ in dataset:
            pass

    # Give the daemon a moment to settle, then flush, compact and freeze every shard.
    time.sleep(30)
    for i in range(64):
        print(f"Optimizing table fineweb{i}", file=sys.stderr)
        response = requests.post(
            sql_url,
            data={"query": f"FLUSH TABLE fineweb{i}"},
            timeout=600,
        )
        print(response.text, file=sys.stderr)
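        # Merge disk chunks down to at most 16 (cutoff=16), waiting for the
        # merge to complete (sync=1).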
        response = requests.post(
            sql_url,
            data={"query": f"OPTIMIZE TABLE fineweb{i} OPTION cutoff=16, sync=1"},
            timeout=600,
        )
        print(response.text, file=sys.stderr)
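        # Freeze the shard so its on-disk files stay stable from here on.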
        response = requests.post(
            sql_url,
            data={"query": f"FREEZE fineweb{i}"},
            timeout=600,
        )
        print(response.text, file=sys.stderr)

    # Smoke-test the distributed table through the JSON /search endpoint.
    response = requests.post(
        "http://127.0.0.1:9308/search",
        data='{"index":"fineweb","query":{"match":{"*":"hello world"}}}',
    )
    print(response.text, file=sys.stderr)