in source/lambda/textractor/python/og.py [0:0]
def indexDocument(self, text, entitiesToIndex):
    """Index the document text and its entities into Elasticsearch.

    The document is written to the ``textract`` index under the id
    ``self.documentId``. If the index does not exist yet it is created
    first, with an explicit multi-format ``date`` mapping.

    Args:
        text: Content stored in the ``content`` field. If it is falsy,
            nothing is indexed.
        entitiesToIndex: Mapping of entity type to its values (may be
            falsy). Keys are lower-cased and added as document fields.
            Values under ``date`` are passed through ``format_date`` and
            stored as ``YYYY-MM-DD`` strings; values for which
            ``format_date`` returns ``UNSUPPORTED_DATE_FORMAT`` are
            dropped.

    Returns:
        None. Failures while creating the index or indexing the document
        are caught and printed rather than raised.
    """
    if not self.elasticsearchDomain:
        print("Document not indexed {}".format(self.elasticsearchDomain))
        return

    if not text:
        # Nothing to index; this case is skipped without any output.
        return

    index_name = "textract"

    # Sign requests with the credentials of the current boto3 session.
    session = boto3.Session()
    credentials = session.get_credentials()
    awsauth = AWS4Auth(credentials.access_key, credentials.secret_key,
                       session.region_name, 'es',
                       session_token=credentials.token)
    es = Elasticsearch(
        hosts=[{'host': self.elasticsearchDomain, 'port': 443}],
        http_auth=awsauth,
        use_ssl=True,
        verify_certs=True,
        connection_class=RequestsHttpConnection
    )
    es_index_client = client.IndicesClient(es)

    document = {
        "documentId": "{}".format(self.documentId),
        "name": "{}".format(self.objectName),
        "bucket": "{}".format(self.bucketName),
        "content": text
    }

    # add comprehend entities while indexing the document
    if entitiesToIndex:
        for key, val in entitiesToIndex.items():
            key = key.lower()
            if key != "date":
                document[key] = val
                continue
            for date in val:
                date_object = format_date(date)
                if date_object != UNSUPPORTED_DATE_FORMAT:
                    # The "date" field only appears once at least one
                    # value has been converted successfully.
                    document.setdefault(key, []).append(
                        date_object.strftime("%Y-%m-%d"))
            print("Document with Converted dates: {}".format(document))

    # Create the index with the explicit date mapping before the first
    # write. If this fails the document is NOT indexed, so it never lands
    # in an index without the desired mapping.
    try:
        if not es_index_client.exists(index=index_name):
            print("Index 'textract' does not exist, creating...")
            es_index_client.create(
                index=index_name,
                body={
                    "settings": {
                        "index": {
                            "number_of_shards": 2
                        }
                    },
                    "mappings": {
                        "properties": {
                            "date": {
                                "type": "date",
                                "format": "M'/'dd'/'yyyy||date||year||year_month||dd MMM yyyy||dd'/'MM'/'yyyy||yyyy'/'MM'/'dd||dd'/'MM'/'YY||year_month_day||MM'/'dd'/'yy||dd MMM||MM'/'yyyy||M-dd-yyyy||MM'/'dd'/'yyyy||M||d'/'MM'/'yyyy||MM'/'dd'/'yy"
                            }
                        }
                    }
                }
            )
    except Exception as E:
        print("Failed to create index with desired mapping {}".format(E))
        return

    # Report indexing failures separately so they are not mistaken for
    # index-creation problems.
    try:
        es.index(index=index_name, id=self.documentId, body=document)
    except Exception as E:
        print("Failed to index document {}: {}".format(self.objectName, E))
        return

    print("Indexed document: {}".format(self.objectName))