def indexDocument()

in source/lambda/textractor/python/og.py [0:0]


    def indexDocument(self, text, entitiesToIndex):
        """Index the document text and its entities into Elasticsearch.

        Builds a SigV4-signed Elasticsearch client from the default boto3
        session, makes sure the ``textract`` index exists (creating it with
        an explicit ``date`` mapping when it does not), and stores one
        document whose id is ``self.documentId``.

        Args:
            text: Text stored in the ``content`` field. When falsy, the
                method returns without indexing and without logging.
            entitiesToIndex: Optional mapping of entity type to values.
                Keys are lower-cased and added as document fields. Values
                under the ``date`` key are passed through ``format_date``;
                entries that come back as ``UNSUPPORTED_DATE_FORMAT`` are
                dropped and the rest are stored as ``YYYY-MM-DD`` strings.

        Returns:
            None. Failures are reported with ``print`` and never raised, so
            indexing stays best-effort for the caller.
        """
        if not self.elasticsearchDomain:
            print("Document not indexed {}".format(self.elasticsearchDomain))
            return

        # Nothing to index. Unlike the missing-domain case this is silent.
        if not text:
            return

        index_name = "textract"
        host = self.elasticsearchDomain
        service = 'es'

        # Sign requests with the credentials of the default boto3 session.
        ss = boto3.Session()
        credentials = ss.get_credentials()
        region = ss.region_name

        awsauth = AWS4Auth(credentials.access_key, credentials.secret_key,
                           region, service, session_token=credentials.token)

        es = Elasticsearch(
            hosts=[{'host': host, 'port': 443}],
            http_auth=awsauth,
            use_ssl=True,
            verify_certs=True,
            connection_class=RequestsHttpConnection
        )

        es_index_client = client.IndicesClient(es)

        document = {
            "documentId": "{}".format(self.documentId),
            "name": "{}".format(self.objectName),
            "bucket": "{}".format(self.bucketName),
            "content": text
        }

        # add comprehend entities while indexing the document
        if entitiesToIndex:
            for key, val in entitiesToIndex.items():
                key = key.lower()
                if key != "date":
                    document[key] = val
                    continue

                # Only dates that format_date could parse are indexed; the
                # field is left out entirely when none of them qualify.
                for date in val:
                    date_object = format_date(date)
                    if date_object != UNSUPPORTED_DATE_FORMAT:
                        document.setdefault(key, []).append(
                            date_object.strftime("%Y-%m-%d"))
                print("Document with Converted dates: {}".format(document))

        # Step 1: make sure the index exists with the desired mapping.
        try:
            if not es_index_client.exists(index=index_name):
                print("Index 'textract' does not exist, creating...")
                try:
                    es_index_client.create(
                        index=index_name,
                        body={
                            "settings": {
                                "index": {
                                    "number_of_shards": 2
                                }
                            },
                            "mappings": {
                                "properties": {
                                    "date": {
                                        "type": "date",
                                        "format": "M'/'dd'/'yyyy||date||year||year_month||dd MMM yyyy||dd'/'MM'/'yyyy||yyyy'/'MM'/'dd||dd'/'MM'/'YY||year_month_day||MM'/'dd'/'yy||dd MMM||MM'/'yyyy||M-dd-yyyy||MM'/'dd'/'yyyy||M||d'/'MM'/'yyyy||MM'/'dd'/'yy"
                                    }
                                }
                            }
                        }
                    )
                except Exception:
                    # A concurrent invocation may have created the index
                    # between exists() and create(). If the index is there
                    # now, carry on instead of dropping the document.
                    if not es_index_client.exists(index=index_name):
                        raise
        except Exception as E:
            print("Failed to create index with desired mapping {}".format(E))
            return

        # Step 2: index the document. Reported separately so an indexing
        # failure is not mislabelled as an index-creation failure.
        try:
            es.index(index=index_name, id=self.documentId, body=document)
        except Exception as E:
            print("Failed to index document {}: {}".format(self.objectName, E))
            return

        print("Indexed document: {}".format(self.objectName))