# lambda-email-parser/lambda_function.py

import email
import json
import logging
import os
import re
import uuid

import boto3

logger = logging.getLogger()
logger.setLevel(logging.INFO)

# boto3 clients used by the handler below
s3 = boto3.client('s3')
workmail_message_flow = boto3.client('workmailmessageflow')


def lambda_handler(event, context):
    logger.error(json.dumps(event))
    destination_bucket = os.environ.get('destination_bucket')
    key_prefix = None
    if not destination_bucket:
        logger.error("Environment variable missing: destination_bucket")
        return
    # keep track of how many MIME parts are parsed and saved to S3
    saved_parts = 0
    msg = None
    parts = None
    workmail_mutate = None
    # event is from WorkMail
    if event.get('messageId'):
        message_id = event['messageId']
        key_prefix = message_id
        raw_msg = workmail_message_flow.get_raw_message_content(messageId=message_id)
        msg = email.message_from_bytes(raw_msg['messageContent'].read())
        # setting modify_workmail_message to any non-empty value enables mutation
        if os.environ.get('modify_workmail_message'):
            workmail_mutate = True
    # event is from S3
    else:
        records = event.get('Records', [])
        record = records[0]
        # TODO: for record in records:
        # get the S3 object information
        s3_info = record['s3']
        object_info = s3_info['object']
        if s3_info['bucket']['name'] == destination_bucket:
            logger.error("To prevent recursive file creation, this function will not write back to the same bucket")
            return {
                'statusCode': 400,
                'body': 'To prevent recursive file creation, this function will not write back to the same bucket'
            }
        # get the email message stored in S3 and parse it using the Python email library
        # TODO: error condition - if the file isn't an email message or doesn't parse correctly
        object_key = object_info['key']
        key_prefix = object_key
        fileObj = s3.get_object(Bucket=s3_info['bucket']['name'], Key=object_key)
        msg = email.message_from_bytes(fileObj['Body'].read())
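        # One shape the TODO above could take (a sketch, not wired in; would need a
        # botocore import): treat fetch failures as a client error instead of letting
        # the exception kill the invocation.
        #   try:
        #       fileObj = s3.get_object(Bucket=s3_info['bucket']['name'], Key=object_key)
        #       msg = email.message_from_bytes(fileObj['Body'].read())
        #   except botocore.exceptions.ClientError as err:
        #       logger.error(f"Could not fetch {object_key}: {err}")
        #       return {'statusCode': 400, 'body': 'Could not read object from S3'}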
    # save the headers of the message to the bucket
    # all headers are saved by default; set the select_headers environment variable
    # to a comma-separated list of header names to save only those
    headers_to_save = re.split(r',\s*', os.environ.get('select_headers', 'ALL'))
    all_headers = msg.items()
    if "ALL" in headers_to_save:
        s3.put_object(Bucket=destination_bucket, Key=key_prefix + "/headers.json", Body=json.dumps(all_headers))
    elif headers_to_save:
        wanted = {header.upper() for header in headers_to_save}
        saved_headers = [header for header in all_headers if header[0].upper() in wanted]
        s3.put_object(Bucket=destination_bucket, Key=key_prefix + "/headers.json", Body=json.dumps(saved_headers))
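    # headers.json ends up as a JSON array of [name, value] pairs, for example:
    #   [["From", "sender@example.com"], ["Subject", "Hello"]]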
    # walk the MIME tree; msg.walk() yields the message itself first, then every
    # nested part in depth-first order
    parts = msg.walk()
    # handle each MIME part from the email message
    part_idx = 0
    for part in parts:
        part_idx += 1
        # get information about the MIME part
        content_type = part.get_content_type()
        content_disposition = str(part.get_content_disposition())
        content = part.get_payload(decode=True)
        charset = part.get_content_charset()
        filename = part.get_filename()
        logger.error(f"Part: {part_idx}. Content charset: {charset}. Content type: {content_type}. Content disposition: {content_disposition}. Filename: {filename}")
        # choose a filename for the body and for untitled text or HTML parts
        # add additional content types here to support more parts that lack filenames
        if not filename:
            if content_type == 'text/plain':
                if 'attachment' not in content_disposition:
                    filename = "body.txt"
                else:
                    filename = "untitled.txt"
            elif content_type == 'text/html':
                if 'attachment' not in content_disposition:
                    filename = "body.html"
                else:
                    filename = "untitled.html"
            else:
                filename = "untitled"
        # TODO: consider overriding or sanitizing the filenames, since they are tainted data and might be abused in object key names
        # technically, the entire message is tainted data, so downstream parsers are responsible for protecting themselves from interpreter abuse
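        # A minimal sanitizer sketch for the TODO above (hypothetical helper, not
        # called anywhere): allow only characters that are safe in S3 object keys.
        #   def sanitize_filename(name):
        #       return re.sub(r'[^A-Za-z0-9._-]', '_', name)[:64] or "untitled"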
        # skip multipart container parts; they carry no payload of their own
        if content_type in ["multipart/mixed", "multipart/related", "multipart/alternative"]:
            continue
        if content:
            # decode the content based on the character set specified, falling back to
            # the raw bytes if the charset is unknown or the payload doesn't decode
            if charset:
                try:
                    content = content.decode(charset)
                except (LookupError, UnicodeDecodeError) as err:
                    logger.error(f"Part {part_idx}: decoding as {charset} failed, storing raw bytes: {err}")
            # store the decoded MIME part in S3 with the filename appended to the object key
            s3.put_object(Bucket=destination_bucket, Key=key_prefix + "/mimepart" + str(part_idx) + "_" + filename, Body=content)
            saved_parts += 1
        else:
            logger.error(f"Part {part_idx} has no content. Content type: {content_type}. Content disposition: {content_disposition}.")
    if workmail_mutate:
        email_subject = event['subject']
        modified_object_key = key_prefix + "/" + str(uuid.uuid4())
        new_subject = f"[PROCESSED] {email_subject}"
        msg.replace_header('Subject', new_subject)
        msg.add_header('X-AWS-Mailsploder-Bucket-Prefix', "s3://" + destination_bucket + "/" + key_prefix)
        msg.add_header('X-AWS-Mailsploder-Parts-Saved', str(saved_parts))
        # store the updated email in S3
        s3.put_object(Bucket=destination_bucket, Key=modified_object_key, Body=msg.as_bytes())
        # update the email in WorkMail
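        # note: put_raw_message_content reads the replacement message from S3, so
        # WorkMail needs read access to this object (granted via a bucket policy;
        # see the WorkMail message flow documentation)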
        s3_reference = {
            'bucket': destination_bucket,
            'key': modified_object_key
        }
        content = {
            's3Reference': s3_reference
        }
        workmail_message_flow.put_raw_message_content(messageId=message_id, content=content)
    return {
        'statusCode': 200,
        'body': 'Number of parts saved to S3 bucket ' + destination_bucket + ': ' + str(saved_parts)
    }
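

# A minimal S3 event for exercising the handler locally (a sketch; the bucket and
# key names are made up, and destination_bucket must point at a different bucket):
#   lambda_handler({
#       "Records": [
#           {"s3": {"bucket": {"name": "inbound-mail"}, "object": {"key": "message.eml"}}}
#       ]
#   }, None)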