in longform-qa/lfqa_utils.py [0:0]
from elasticsearch.helpers import streaming_bulk
from tqdm.auto import tqdm


def make_es_index_snippets(es_client, passages_dset, index_name="english_wiki_kilt_snippets_100w"):
    # Single-shard index with BM25 similarity on the title and passage text fields.
    index_config = {
        "settings": {
            "number_of_shards": 1,
            "analysis": {"analyzer": {"stop_standard": {"type": "standard", "stopwords": "_english_"}}},
        },
        "mappings": {
            "properties": {
                "article_title": {"type": "text", "analyzer": "standard", "similarity": "BM25"},
                "section_title": {"type": "text", "analyzer": "standard", "similarity": "BM25"},
                "passage_text": {"type": "text", "analyzer": "standard", "similarity": "BM25"},
            }
        },
    }
    es_client.indices.create(index=index_name, body=index_config)
    number_of_docs = passages_dset.num_rows
    progress = tqdm(unit="docs", total=number_of_docs)
    successes = 0

    def passage_generator():
        for passage in passages_dset:
            yield passage

    # Populate the index in bulk, tracking progress and counting successful insertions.
    for ok, action in streaming_bulk(client=es_client, index=index_name, actions=passage_generator()):
        progress.update(1)
        successes += ok
    print("Indexed %d documents" % (successes,))