def get_document()

in lambda/indexer/jobcomplete.py [0:0]


def get_document(dsId, indexId, s3url, item, text):
    """Build the index document dict for ``s3url`` and overlay its metadata.

    The base document uses ``s3url`` as ``Id``, the file name returned by
    ``parse_s3url`` as ``Title``, ``text`` as ``Blob``, and three string
    attributes: the data source id, the sync job execution id and an
    ``https://s3.<region>.amazonaws.com/<bucket>/<key>`` source URI.

    Metadata loaded from ``item['metadata_url']`` is then applied. Only
    truthy values are considered:

    * ``DocumentId`` / ``ContentType`` are never applied; an error is logged.
    * ``Title`` replaces the default title.
    * ``Attributes`` are converted by ``get_metadata_attributes`` and
      appended after the three built-in attributes.
    * ``AccessControlList`` is copied onto the document unchanged.

    NOTE(review): the key names look like the Amazon Kendra ``Document``
    shape -- confirm against the caller that submits this dict.

    Args:
        dsId: Value stored in the ``_data_source_id`` attribute.
        indexId: Not referenced by this function.
        s3url: S3 URL of the source object; also used as the document id.
        item: Mapping read for ``'sync_job_id'`` and ``'metadata_url'``.
        text: Content stored under ``Blob``.

    Returns:
        The assembled document dict.
    """
    bucket, key, file_name = parse_s3url(s3url)
    region = get_bucket_region(bucket)

    # Built-in attributes, listed in the order they appear in the document.
    builtin_pairs = (
        ("_data_source_id", dsId),
        ("_data_source_sync_job_execution_id", item['sync_job_id']),
        ("_source_uri", f"https://s3.{region}.amazonaws.com/{bucket}/{key}"),
    )
    attributes = [
        {"Key": attr_key, "Value": {"StringValue": attr_value}}
        for attr_key, attr_value in builtin_pairs
    ]
    document = {
        "Id": s3url,
        "Title": file_name,
        "Attributes": attributes,
        "Blob": text,
    }

    # merge metadata
    metadata = get_s3jsondata(item['metadata_url'])

    # These fields are owned by the indexer; metadata values are only reported.
    for reserved_field in ("DocumentId", "ContentType"):
        if metadata.get(reserved_field):
            logger.error(f"Metadata may not override: {reserved_field}")

    title_override = metadata.get("Title")
    if title_override:
        logger.info(f"Set 'Title' to: \"{title_override}\"")
        document['Title'] = title_override

    if metadata.get("Attributes"):
        logger.info("Set 'Attributes'")
        document["Attributes"].extend(get_metadata_attributes(metadata))

    access_control = metadata.get("AccessControlList")
    if access_control:
        logger.info("Set 'AccessControlList'")
        document["AccessControlList"] = access_control

    return document