def _create_es_index()

in distant_supervision/ds_es_client.py [0:0]


    def _create_es_index(self):
        es_conf = self.es_conf
        es = ElasticsearchMagic.get_instance('singleton', hosts=[es_conf.hosts])

        # delete index if exists
        if es.indices.exists(index=es_conf.index_name):
            es.indices.delete(index=es_conf.index_name)

        settings = {
            "number_of_shards": 9,
            "number_of_replicas": 1,
            "similarity": {
                "default": {
                    "type": "BM25",
                    "k1": 0.1,  # default is 1.2. Value of 0.0 means that it only depends on IDF (not TF).
                    "b": 0.1,  # default is 0.75. Value of 0.0 disables length-normalization.
                }
            },
            "analysis": {
                "filter": {
                    "english_possessive_stemmer": {
                        "name": "possessive_english",
                        "type": "stemmer"
                    },
                    "english_stop": {
                        "stopwords": "_english_",
                        "type": "stop"
                    },
                    "kstem_stemmer": {
                        # kstem is less aggressive than porter, e.g. "dogs" => "dog" in porter, but not in kstem
                        "name": "light_english",
                        "type": "stemmer"
                    },
                    "english_porter_stemmer": {
                        "name": "english",  # porter, see StemmerTokenFilterFactory.java
                        "type": "stemmer"
                    }
                },
                "analyzer": {
                    "porter_eng_analyzer": {
                        # https://stackoverflow.com/questions/33945796/understanding-analyzers-filters-and-queries-in-elasticsearch
                        "filter": [
                            "standard",  # does nothing: https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-standard-tokenfilter.html
                            "asciifolding",
                            "english_possessive_stemmer",
                            "lowercase",
                            "english_stop",
                            "english_porter_stemmer"
                        ],
                        "tokenizer": "standard"
                    },
                    "kstem_eng_analyzer": {
                        "filter": [
                            "standard",
                            "asciifolding",
                            "english_possessive_stemmer",
                            "lowercase",
                            "english_stop",
                            "kstem_stemmer"
                        ],
                        "tokenizer": "standard"
                    },
                    "possessive_english_analyzer": {
                        # no stemming
                        "filter": [
                            "standard",
                            "asciifolding",
                            "english_possessive_stemmer",
                            "lowercase",
                            "english_stop",
                        ],
                        "tokenizer": "standard"
                    },
                    "standard_english_analyzer": {
                        "type": "standard",
                        "stopwords": "_english_"
                    },
                }
            }
        }

        mappings_for_analyzed_text_field = {
            "type": "text",
            "index": True,
            "analyzer": "porter_eng_analyzer",
            "fields": {
                "possessive": {"type": "text", "analyzer": "possessive_english_analyzer"},
                "kstem": {"type": "text", "analyzer": "kstem_eng_analyzer"},
            },
        }

        mappings = {
            "doc": {
                "properties": {
                    "entities": {
                        "type": "text",  # json string
                        "index": False,
                    },
                    "noun_chunks": {
                        "type": "text",  # json string
                        "index": False,
                    },
                    "article_title": {
                        "type": "keyword",
                        "index": False,
                    },
                    "article_id": {
                        "type": "integer",
                        "index": True,
                    },
                    "body": mappings_for_analyzed_text_field,
                    "body_with_title": mappings_for_analyzed_text_field,
                }
            }
        }

        es.indices.create(es_conf.index_name, body=dict(
            mappings=mappings,
            settings=settings))

        es.indices.open(es_conf.index_name)