def make_es_index_snippets()

in longform-qa/lfqa_utils.py [0:0]


def make_es_index_snippets(es_client, passages_dset, index_name="english_wiki_kilt_snippets_100w"):
    """Create an Elasticsearch index and bulk-load every passage into it.

    The index is created with a single shard and three BM25-scored text
    fields (``article_title``, ``section_title``, ``passage_text``), then
    each element of ``passages_dset`` is sent to it through
    ``streaming_bulk`` while a progress bar tracks the upload.

    Args:
        es_client: Elasticsearch client; must expose ``indices.create`` and
            be usable as the ``client`` argument of ``streaming_bulk``.
        passages_dset: Iterable of passage documents that also exposes a
            ``num_rows`` attribute (used only to size the progress bar).
            Presumably each passage is a dict carrying the three mapped
            fields -- confirm against the caller.
        index_name: Name of the index to create and fill.

    Returns:
        None. The number of successfully indexed documents is printed.

    Note:
        ``indices.create`` is called unconditionally, so whatever error the
        client raises for an already-existing index propagates to the caller.
    """
    index_config = {
        "settings": {
            "number_of_shards": 1,
            "analysis": {
                "analyzer": {
                    # The option key used to be " stopwords" (leading space),
                    # which is not the parameter name the standard analyzer
                    # documents; "stopwords" is what enables the English list.
                    "stop_standard": {"type": "standard", "stopwords": "_english_"},
                }
            },
        },
        "mappings": {
            "properties": {
                # NOTE(review): the fields use the built-in "standard"
                # analyzer, not "stop_standard" defined above. Left as-is so
                # indexing/search behaviour does not change.
                "article_title": {"type": "text", "analyzer": "standard", "similarity": "BM25"},
                "section_title": {"type": "text", "analyzer": "standard", "similarity": "BM25"},
                "passage_text": {"type": "text", "analyzer": "standard", "similarity": "BM25"},
            }
        },
    }
    es_client.indices.create(index=index_name, body=index_config)

    successes = 0
    # Context manager guarantees the bar is closed even if a bulk request raises.
    with tqdm(unit="docs", total=passages_dset.num_rows) as progress:
        # streaming_bulk yields one (ok, result) pair per submitted document;
        # only the success flag is needed here.
        for ok, _ in streaming_bulk(client=es_client, index=index_name, actions=iter(passages_dset)):
            progress.update(1)
            successes += ok
    print("Indexed %d documents" % (successes,))