def search()

in source/lambda/apiprocessor/search.py [0:0]


def _parse_date_filter(keyword):
    """Extract the single date filter embedded in a search keyword.

    Accepted forms are ``date:'[YYYY-mm-dd TO YYYY-mm-dd]'`` (a range) and
    ``date:'[YYYY-mm-dd]'`` (one day). A range takes precedence: when exactly
    one range is present, any single-date tokens are ignored.

    Args:
        keyword: The raw query string; expected to contain ``date:``.

    Returns:
        A ``(keyword, date_from, date_to)`` tuple. For a single-date filter
        the ``date:'[...]'`` token in the keyword is rewritten to
        ``"<date> TO <date>"`` and both bounds are the same day; for a range
        the keyword is returned unchanged.

    Raises:
        ValueError: If no well-formed date filter is found, if more than one
            date or date range is given, or if a date is not a valid calendar
            date (raised by ``strptime``).
    """
    ranges = re.findall(
        r'date:\'\[(\d{4}-\d{2}-\d{2} TO \d{4}-\d{2}-\d{2})\]\'', keyword)
    if len(ranges) > 1:
        raise ValueError("Searching for multiple date ranges at a time is not permitted")
    if len(ranges) == 1:
        start, end = ranges[0].split(" TO ")
        return (
            keyword,
            datetime.datetime.strptime(start, "%Y-%m-%d"),
            datetime.datetime.strptime(end, "%Y-%m-%d"),
        )

    # No range given: fall back to the single-date form.
    dates = re.findall(r'date:\'\[(\d{4}-\d{2}-\d{2})\]\'', keyword)
    if not dates:
        raise ValueError("Invalid Search. Date search should be in format date:'[YYYY-mm-dd TO YYYY-mm-dd]' or date:'[YYYY-mm-dd]'")
    if len(dates) > 1:
        raise ValueError("Searching for multiple dates at a time is not permitted. Use a date range instead.")

    day = datetime.datetime.strptime(dates[0], "%Y-%m-%d")
    keyword = re.sub(r'date:\'\[\d{4}-\d{2}-\d{2}\]\'',
                     dates[0] + " TO " + dates[0], keyword)
    return keyword, day, day


def search(request):
    """Run a keyword search against the ``textract`` Elasticsearch index.

    Args:
        request: Mapping with ``"elasticsearchDomain"`` (the host to query)
            and, optionally, ``"keyword"`` (a query-string expression that
            may embed one ``date:'[...]'`` filter).

    Returns:
        * ``request`` itself, unchanged, when no keyword is supplied;
        * a list of ``{"documentId", "name", "bucket", "count", "lines"}``
          dicts, one per hit, when the response contains ``"hits"``;
        * otherwise the raw response returned by the Elasticsearch client.

    Raises:
        KeyError: If ``"elasticsearchDomain"`` is missing from ``request``.
        ValueError: If the keyword contains ``date:`` but no valid single
            date filter (see ``_parse_date_filter``).
    """
    host = request["elasticsearchDomain"]
    keyword = request.get("keyword")

    # Without a keyword there is nothing to search for. Bail out before any
    # substring test on the keyword, which would raise TypeError on None.
    if keyword is None:
        return request

    # Extract the dates that the user wants to query.
    date_from = date_to = None
    if 'date:' in keyword:
        keyword, date_from, date_to = _parse_date_filter(keyword)
    # Track explicitly whether a date filter was parsed. Testing the keyword
    # for the substring "date" later would fire on words such as "update"
    # (with no dates defined) and would miss single-date searches, whose
    # "date:" token has been rewritten out of the keyword.
    has_date_filter = date_from is not None

    searchBody = {
        "query": {
            "query_string": {
                "query": keyword
            }
        },
        "highlight": {
            "fields": {
                "content": {"pre_tags": [""], "post_tags": [""]},
            },
            "fragment_size": ES_HIGHLIGHT_FRAGMENT_SIZE,
            "require_field_match": False
        }
    }

    service = 'es'
    ss = boto3.Session()
    credentials = ss.get_credentials()
    region = ss.region_name

    awsauth = AWS4Auth(credentials.access_key, credentials.secret_key,
                       region, service, session_token=credentials.token)

    es = Elasticsearch(
        hosts=[{'host': host, 'port': 443}],
        http_auth=awsauth,
        use_ssl=True,
        verify_certs=True,
        connection_class=RequestsHttpConnection
    )

    output = es.search(
        index='textract',
        body=searchBody,
        _source=True,
        filter_path=['hits.hits._id', 'hits.hits._source', 'hits.hits.highlight']
    )

    if "hits" not in output:
        return output

    results = []
    # The hit list is nested one level down: response["hits"]["hits"].
    for hit in output["hits"]["hits"]:
        source = hit["_source"]

        # Date matches are only computed when the query carried a date filter.
        date_match_count = 0
        if has_date_filter:
            date_match_count = calculate_date_matches(source, date_to, date_from)

        # Decide the match count and the lines to display based on whether
        # content was highlighted for this hit or not.
        if "highlight" in hit:
            lines = hit["highlight"]["content"]
            content_match_count = len(lines) + date_match_count
        else:
            lines = [source["content"][10:100]]
            content_match_count = date_match_count

        results.append({
            "documentId": hit["_id"],
            "name": source["name"],
            "bucket": source["bucket"],
            "count": content_match_count,
            "lines": lines
        })
    return results