in source/lambda/apiprocessor/search.py [0:0]
def _parse_date_filter(keyword):
    """Extract the date bounds from a keyword that contains ``date:``.

    Two forms are recognised: ``date:'[YYYY-mm-dd TO YYYY-mm-dd]'`` (a range)
    and ``date:'[YYYY-mm-dd]'`` (a single day, treated as a one-day range).

    Args:
        keyword: The raw search string; expected to contain ``date:``.

    Returns:
        A ``(keyword, date_from, date_to)`` tuple. ``date_from`` and
        ``date_to`` are ``datetime.datetime`` objects. ``keyword`` is returned
        unchanged for a range; for a single date the ``date:'[D]'`` token is
        replaced by ``D TO D``.

    Raises:
        ValueError: If no well-formed date filter is found, if more than one
            date or date range is supplied, or if a date does not parse as
            ``%Y-%m-%d`` (raised by ``strptime``).
    """
    range_matches = re.findall(
        r'date:\'\[(\d{4}-\d{2}-\d{2} TO \d{4}-\d{2}-\d{2})\]\'', keyword)
    if len(range_matches) > 1:
        raise ValueError("Searching for multiple date ranges at a time is not permitted")
    if len(range_matches) == 1:
        start_text, end_text = range_matches[0].split(" TO ")
        date_from = datetime.datetime.strptime(start_text, "%Y-%m-%d")
        date_to = datetime.datetime.strptime(end_text, "%Y-%m-%d")
        return keyword, date_from, date_to

    # No range given: fall back to the single-date form.
    single_date_pattern = r'date:\'\[(\d{4}-\d{2}-\d{2})\]\''
    dates = re.findall(single_date_pattern, keyword)
    if not dates:
        raise ValueError("Invalid Search. Date search should be in format date:'[YYYY-mm-dd TO YYYY-mm-dd]' or date:'[YYYY-mm-dd]'")
    if len(dates) > 1:
        raise ValueError("Searching for multiple dates at a time is not permitted. Use a date range instead.")

    day = datetime.datetime.strptime(dates[0], "%Y-%m-%d")
    # NOTE(review): the rewritten text no longer carries the `date:` prefix or
    # the brackets -- confirm this is the query syntax the index expects.
    keyword = re.sub(single_date_pattern, dates[0] + " TO " + dates[0], keyword)
    return keyword, day, day


def search(request):
    """Run a keyword search against the ``textract`` Elasticsearch index.

    Args:
        request: Mapping that must contain ``"elasticsearchDomain"`` (the
            Elasticsearch host) and may contain ``"keyword"`` (the query
            string, optionally holding one ``date:'[...]'`` filter).

    Returns:
        * ``request`` itself when no keyword is supplied.
        * The raw Elasticsearch response when it has no ``"hits"`` key.
        * Otherwise a list with one dict per hit, holding ``documentId``,
          ``name``, ``bucket``, ``count`` (highlight fragments plus date
          matches) and ``lines`` (highlight fragments, or a slice of the
          document content when nothing was highlighted).

    Raises:
        KeyError: If ``"elasticsearchDomain"`` is missing from ``request``.
        ValueError: If the keyword contains a malformed or repeated date
            filter.
    """
    host = request["elasticsearchDomain"]
    keyword = request.get("keyword")

    # Nothing to search for: hand the request back untouched instead of
    # failing on the date check below.
    if keyword is None:
        return request

    # The bounds stay None unless a date filter was actually parsed, so the
    # per-hit date counting can never read unbound names.
    date_from = date_to = None
    if 'date:' in keyword:
        keyword, date_from, date_to = _parse_date_filter(keyword)
    has_date_filter = date_from is not None

    search_body = {
        "query": {
            "query_string": {
                "query": keyword
            }
        },
        "highlight": {
            "fields": {
                "content": {"pre_tags": [""], "post_tags": [""]},
            },
            # ES_HIGHLIGHT_FRAGMENT_SIZE is defined elsewhere in this module.
            "fragment_size": ES_HIGHLIGHT_FRAGMENT_SIZE,
            "require_field_match": False
        }
    }

    # Sign requests with the credentials of the default boto3 session.
    service = 'es'
    session = boto3.Session()
    credentials = session.get_credentials()
    region = session.region_name
    awsauth = AWS4Auth(credentials.access_key, credentials.secret_key,
                       region, service, session_token=credentials.token)

    es = Elasticsearch(
        hosts=[{'host': host, 'port': 443}],
        http_auth=awsauth,
        use_ssl=True,
        verify_certs=True,
        connection_class=RequestsHttpConnection
    )
    response = es.search(
        index='textract',
        body=search_body,
        _source=True,
        filter_path=['hits.hits._id', 'hits.hits._source', 'hits.hits.highlight']
    )

    if "hits" not in response:
        return response

    results = []
    # The hit list is nested one level down: response["hits"]["hits"].
    for hit in response["hits"]["hits"]:
        source = hit["_source"]

        # Date matches are only counted when the query carried a date filter.
        date_match_count = 0
        if has_date_filter:
            date_match_count = calculate_date_matches(source, date_to, date_from)

        if "highlight" in hit:
            lines = hit["highlight"]["content"]
            match_count = len(lines) + date_match_count
        else:
            # No highlighted fragment: show a short slice of the content.
            lines = [source["content"][10:100]]
            match_count = date_match_count

        results.append({
            "documentId": hit["_id"],
            "name": source["name"],
            "bucket": source["bucket"],
            "count": match_count,
            "lines": lines
        })

    return results