in source/lambda/helper/python/kendraHelper.py [0:0]
def indexDocument(self,
                  kendraIndexId,
                  kendraRoleArn,
                  s3bucket,
                  s3key,
                  documentId,
                  documentExtension,
                  tag='everybody'):
    """Index one S3 document in Kendra together with its access control list.

    An optional policy file is looked up in the same bucket at
    ``public/<documentId>/<name>.<documentExtension>.metadata.json``. When it
    can be read and parsed, its ``Title`` (if any) becomes the document title
    and its ``AccessControlList`` entries are added to the memberships.
    Failing to read or parse the policy file is never fatal: the problem is
    printed and indexing proceeds with the memberships gathered so far.

    Args:
        kendraIndexId: Id of the Kendra index, passed as ``IndexId``.
        kendraRoleArn: role ARN passed as ``RoleArn`` to Kendra.
        s3bucket: bucket holding the document and the optional policy file.
        s3key: key of the document to index.
        documentId: id given to the document in Kendra; also part of the
            policy file path.
        documentExtension: extension of the original document, used to build
            the policy file name.
        tag: additional group allowed to see the document. The 'everybody'
            group is always added.

    Returns:
        None.

    Raises:
        Whatever ``boto3.client`` / ``batch_put_document`` raise; those calls
        are not guarded. A missing ``AWS_REGION`` environment variable raises
        ``KeyError`` when the Kendra client is created.
    """
    defaultGroup = 'everybody'

    # Recover the original document name from the key of the searchable PDF.
    # NOTE(review): presumably SEARCHABLE_PDF_SUFFIX_LENGTH is negative so the
    # slice strips a fixed suffix -- confirm against the constant definition.
    originalDocumentName = s3key[:SEARCHABLE_PDF_SUFFIX_LENGTH].split('/')[DOCUMENT_KEY_FILENAME_INDEX]
    policyFilepath = ("public/" + documentId + "/" + originalDocumentName
                      + "." + documentExtension + ".metadata.json")

    # Try to fetch the optional kendra policy file that may have been uploaded
    # to s3 along with the document.
    s3helper = S3Helper()
    policyData = None
    try:
        policyData = s3helper.readFromS3(s3bucket,
                                         policyFilepath,
                                         os.environ['AWS_REGION'])
    except ClientError as e:
        policyData = None
        # NoSuchKey is the expected case (no policy file was provided); any
        # other code is an error worth investigating, but indexing continues.
        if e.response['Error']['Code'] == 'NoSuchKey':
            print("No kendra policy file found, only default membership will be applied")
        else:
            print("ClientError exception from s3helper.readFromS3: " + str(e))
    except Exception as e:
        policyData = None
        print("unspecified exception from s3helper.readFromS3: " + str(e))

    # The access control list always starts with the default membership.
    accessControlList = [
        {'Name': defaultGroup, 'Type': 'GROUP', 'Access': 'ALLOW'},
    ]

    # Add the caller-provided tag when it differs from the default. The dict
    # is built here; the previous code assigned keys on an undefined name and
    # raised NameError for every non-default tag.
    if tag != defaultGroup:
        accessControlList.append(
            {'Name': tag, 'Type': 'GROUP', 'Access': 'ALLOW'})

    # If the policy file exists it may carry a title and extra memberships.
    # Parsing errors are caught so that indexing proceeds without them.
    documentTitle = None
    try:
        if policyData is not None:
            policy = json.loads(policyData)
            if 'Title' in policy:
                documentTitle = policy['Title']
            # A policy without an AccessControlList simply adds no memberships.
            for membership in policy.get('AccessControlList') or []:
                # Skip tags that were already added above.
                if membership['Name'] != defaultGroup and membership['Name'] != tag:
                    accessControlList.append(membership)
    except Exception as e:
        print("Exception while processing policy file " + policyFilepath + ": " + str(e))

    print('Document {} will have the following membership policy in Kendra:{}'.format(documentId, json.dumps(accessControlList)))

    # Get Kendra to index the document along with its memberships.
    document = {
        'Id': documentId,
        'AccessControlList': accessControlList,
        'ContentType': 'PDF',
        'S3Path': {'Bucket': s3bucket, 'Key': s3key},
    }
    if documentTitle is not None:
        document['Title'] = documentTitle

    client = boto3.client('kendra', region_name=os.environ['AWS_REGION'])
    response = client.batch_put_document(IndexId=kendraIndexId,
                                         RoleArn=kendraRoleArn,
                                         Documents=[document])

    # batch_put_document reports per-document failures in the response rather
    # than raising, so surface them in the log instead of dropping them.
    if isinstance(response, dict) and response.get('FailedDocuments'):
        print('Kendra failed to index document {}: {}'.format(
            documentId, json.dumps(response['FailedDocuments'], default=str)))
    return