def indexDocument()

in source/lambda/helper/python/kendraHelper.py [0:0]


    def indexDocument(self,
                      kendraIndexId,
                      kendraRoleArn,
                      s3bucket,
                      s3key,
                      documentId,
                      documentExtension,
                      tag = 'everybody'):
        """Submit one S3 document to a Kendra index together with its access control list.

        The access control list always contains the default 'everybody' group.
        When `tag` differs from 'everybody' it is added as a second allowed
        group.  If an optional policy file is found in S3 at
        "public/<documentId>/<originalDocumentName>.<documentExtension>.metadata.json",
        its 'Title' (if present) becomes the document title and every entry
        of its 'AccessControlList' whose 'Name' is neither 'everybody' nor
        `tag` is appended as-is.

        Failures while reading or parsing the policy file are printed and
        otherwise ignored: indexing proceeds with whatever memberships were
        collected up to that point.

        Args:
            kendraIndexId: passed as IndexId to batch_put_document.
            kendraRoleArn: passed as RoleArn to batch_put_document.
            s3bucket: bucket holding both the document and the policy file.
            s3key: key of the document to index; also used to derive the
                original document name for the policy file lookup.
            documentId: used as the Kendra document Id and as a path segment
                of the policy file key.
            documentExtension: extension used to build the policy file key.
            tag: name of an additional group to allow; defaults to 'everybody',
                which adds nothing beyond the default membership.

        Returns:
            None.

        Raises:
            KeyError: if the AWS_REGION environment variable is not set when
                the Kendra client is created.
            Exception: anything raised by boto3.client or batch_put_document
                propagates to the caller (those calls are not guarded).
        """
        # Derive the name the document was originally uploaded under.
        # NOTE(review): SEARCHABLE_PDF_SUFFIX_LENGTH and DOCUMENT_KEY_FILENAME_INDEX
        # are module-level constants defined outside this method; presumably the
        # slice strips a searchable-PDF suffix and the index selects the filename
        # segment of the key -- confirm against their definitions.
        originalDocumentName = s3key[:SEARCHABLE_PDF_SUFFIX_LENGTH].split('/')[DOCUMENT_KEY_FILENAME_INDEX]
        policyFilepath = "public/" + documentId + "/" + originalDocumentName + "." + documentExtension + ".metadata.json"

        # Try to fetch the optional kendra policy file that may have been
        # uploaded to s3 along with the document.  policyData stays None on
        # any failure so the parsing step below is skipped.
        policyData = None
        s3helper = S3Helper()

        try:
            # The region lookup is kept inside the try so that a missing
            # AWS_REGION is reported here rather than aborting before the
            # membership list is built.
            policyData = s3helper.readFromS3(s3bucket,
                                             policyFilepath,
                                             os.environ['AWS_REGION'])
        except ClientError as e:
            # NoSuchKey is the expected outcome when no policy file was
            # provided; any other code is a real error but is not fatal.
            if e.response['Error']['Code'] == 'NoSuchKey':
                print("No kendra policy file found, only default membership will be applied")
            else:
                print("ClientError exception from s3helper.readFromS3: " + str(e))
        except Exception as e:
            # An error that should be investigated, but indexing still proceeds.
            print("unspecified exception from s3helper.readFromS3: " + str(e))

        # The default membership applied to all documents.
        accessControlList = [
            {'Name': 'everybody', 'Type': 'GROUP', 'Access': 'ALLOW'},
        ]

        # If a different membership tag was provided in the function call,
        # add it as well.  (The dict must be created here: assigning into an
        # undefined name raised NameError for every non-default tag.)
        if tag != 'everybody':
            tagMembership = {'Name': tag, 'Type': 'GROUP', 'Access': 'ALLOW'}
            accessControlList.append(tagMembership)

        # If the policy file exists, it may contain a title and additional
        # membership tags.  Parsing errors are caught so that indexing can
        # proceed without the memberships from the policy file.
        documentTitle = None

        try:
            if policyData is not None:
                policy = json.loads(policyData)

                if 'Title' in policy:
                    documentTitle = policy['Title']

                # A policy file without an 'AccessControlList' simply
                # contributes no extra memberships instead of being reported
                # as a processing error.
                for membership in policy.get('AccessControlList', []):
                    # Skip tags in the policy that were already added above.
                    if membership['Name'] != 'everybody' and membership['Name'] != tag:
                        accessControlList.append(membership)
        except Exception as e:
            print("Exception while processing policy file " + policyFilepath + str(e))

        print('Document {} will have the following membership policy in Kendra:{}'.format(documentId, json.dumps(accessControlList)))

        # Describe the document for Kendra.  ContentType is always 'PDF'
        # regardless of documentExtension.
        document = {
            'Id': documentId,
            'AccessControlList': accessControlList,
            'ContentType': 'PDF',
            'S3Path': {'Bucket': s3bucket, 'Key': s3key},
        }

        if documentTitle is not None:
            document['Title'] = documentTitle

        # Get Kendra to index the document along with its memberships.
        # NOTE(review): the batch_put_document response is not inspected here,
        # so any per-document failures it reports go unnoticed -- confirm
        # whether that is intended.
        kendraClient = boto3.client('kendra', region_name=os.environ['AWS_REGION'])
        kendraClient.batch_put_document(IndexId=kendraIndexId,
                                        RoleArn=kendraRoleArn,
                                        Documents=[document])