# lambda-email-parser/lambda_function.py

import email
import json
import logging
import os
import re
import uuid

import boto3

logger = logging.getLogger()
logger.setLevel(logging.INFO)

# boto3 clients used by the handler below
s3 = boto3.client('s3')
workmail_message_flow = boto3.client('workmailmessageflow')


def lambda_handler(event, context):
    logger.error(json.dumps(event))
    destination_bucket = os.environ.get('destination_bucket')
    key_prefix = None
    if not destination_bucket:
        logger.error("Environment variable missing: destination_bucket")
        return
    # keep track of how many MIME parts are parsed and saved to S3
    saved_parts = 0
    msg = None
    parts = None
    workmail_mutate = None
    # event is from WorkMail
    if event.get('messageId'):
        message_id = event['messageId']
        key_prefix = message_id
        raw_msg = workmail_message_flow.get_raw_message_content(messageId=message_id)
        msg = email.message_from_bytes(raw_msg['messageContent'].read())
        # setting modify_workmail_message to any non-empty value enables mutation
        if os.environ.get('modify_workmail_message'):
            workmail_mutate = True
    # event is from S3
    else:
        records = event.get('Records', [])
        record = records[0]
        # TODO: for record in records:
        # get the S3 object information
        s3_info = record['s3']
        object_info = s3_info['object']
        if s3_info['bucket']['name'] == destination_bucket:
            logger.error("To prevent recursive file creation, this function will not write back to the same bucket")
            return {
                'statusCode': 400,
                'body': 'To prevent recursive file creation, this function will not write back to the same bucket'
            }
        # get the email message stored in S3 and parse it using the Python email library
        # TODO: error condition - if the file isn't an email message or doesn't parse correctly
        object_key = object_info['key']
        key_prefix = object_key
        fileObj = s3.get_object(Bucket=s3_info['bucket']['name'], Key=object_key)
        msg = email.message_from_bytes(fileObj['Body'].read())
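        # One shape the TODO above could take (a sketch, not wired in; would need a
        # botocore import): treat fetch failures as a client error instead of letting
        # the exception kill the invocation.
        #   try:
        #       fileObj = s3.get_object(Bucket=s3_info['bucket']['name'], Key=object_key)
        #       msg = email.message_from_bytes(fileObj['Body'].read())
        #   except botocore.exceptions.ClientError as err:
        #       logger.error(f"Could not fetch {object_key}: {err}")
        #       return {'statusCode': 400, 'body': 'Could not read object from S3'}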
    # save the headers of the message to the bucket
    # all headers are saved by default; set the select_headers environment variable
    # to a comma-separated list of header names to save only those
    headers_to_save = re.split(r',\s*', os.environ.get('select_headers', 'ALL'))
    all_headers = msg.items()
    if "ALL" in headers_to_save:
        s3.put_object(Bucket=destination_bucket, Key=key_prefix + "/headers.json", Body=json.dumps(all_headers))
    elif headers_to_save:
        wanted = {header.upper() for header in headers_to_save}
        saved_headers = [header for header in all_headers if header[0].upper() in wanted]
        s3.put_object(Bucket=destination_bucket, Key=key_prefix + "/headers.json", Body=json.dumps(saved_headers))
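    # headers.json ends up as a JSON array of [name, value] pairs, for example:
    #   [["From", "sender@example.com"], ["Subject", "Hello"]]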
    # walk the MIME tree; msg.walk() yields the message itself first, then every
    # nested part in depth-first order
    parts = msg.walk()
    # handle each MIME part from the email message
    part_idx = 0
    for part in parts:
        part_idx += 1
        # get information about the MIME part
        content_type = part.get_content_type()
        content_disposition = str(part.get_content_disposition())
        content = part.get_payload(decode=True)
        charset = part.get_content_charset()
        filename = part.get_filename()
        logger.error(f"Part: {part_idx}. Content charset: {charset}. Content type: {content_type}. Content disposition: {content_disposition}. Filename: {filename}")
        # choose a filename for the body and for untitled text or HTML parts
        # add additional content types here to support more parts that lack filenames
        if not filename:
            if content_type == 'text/plain':
                if 'attachment' not in content_disposition:
                    filename = "body.txt"
                else:
                    filename = "untitled.txt"
            elif content_type == 'text/html':
                if 'attachment' not in content_disposition:
                    filename = "body.html"
                else:
                    filename = "untitled.html"
            else:
                filename = "untitled"
        # TODO: consider overriding or sanitizing the filenames, since they are tainted data and might be abused in object key names
        # technically, the entire message is tainted data, so downstream parsers are responsible for protecting themselves from interpreter abuse
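        # A minimal sanitizer sketch for the TODO above (hypothetical helper, not
        # called anywhere): allow only characters that are safe in S3 object keys.
        #   def sanitize_filename(name):
        #       return re.sub(r'[^A-Za-z0-9._-]', '_', name)[:64] or "untitled"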
        # skip multipart container parts; they carry no payload of their own
        if content_type in ["multipart/mixed", "multipart/related", "multipart/alternative"]:
            continue
        if content:
            # decode the content based on the character set specified, falling back to
            # the raw bytes if the charset is unknown or the payload doesn't decode
            if charset:
                try:
                    content = content.decode(charset)
                except (LookupError, UnicodeDecodeError) as err:
                    logger.error(f"Part {part_idx}: decoding as {charset} failed, storing raw bytes: {err}")
            # store the decoded MIME part in S3 with the filename appended to the object key
            s3.put_object(Bucket=destination_bucket, Key=key_prefix + "/mimepart" + str(part_idx) + "_" + filename, Body=content)
            saved_parts += 1
        else:
            logger.error(f"Part {part_idx} has no content. Content type: {content_type}. Content disposition: {content_disposition}.")
    if workmail_mutate:
        email_subject = event['subject']
        modified_object_key = key_prefix + "/" + str(uuid.uuid4())
        new_subject = f"[PROCESSED] {email_subject}"
        msg.replace_header('Subject', new_subject)
        msg.add_header('X-AWS-Mailsploder-Bucket-Prefix', "s3://" + destination_bucket + "/" + key_prefix)
        msg.add_header('X-AWS-Mailsploder-Parts-Saved', str(saved_parts))
        # store the updated email in S3
        s3.put_object(Bucket=destination_bucket, Key=modified_object_key, Body=msg.as_bytes())
        # update the email in WorkMail
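        # note: put_raw_message_content reads the replacement message from S3, so
        # WorkMail needs read access to this object (granted via a bucket policy;
        # see the WorkMail message flow documentation)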
        s3_reference = {
            'bucket': destination_bucket,
            'key': modified_object_key
        }
        content = {
            's3Reference': s3_reference
        }
        workmail_message_flow.put_raw_message_content(messageId=message_id, content=content)
    return {
        'statusCode': 200,
        'body': 'Number of parts saved to S3 bucket ' + destination_bucket + ': ' + str(saved_parts)
    }
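

# A minimal S3 event for exercising the handler locally (a sketch; the bucket and
# key names are made up, and destination_bucket must point at a different bucket):
#   lambda_handler({
#       "Records": [
#           {"s3": {"bucket": {"name": "inbound-mail"}, "object": {"key": "message.eml"}}}
#       ]
#   }, None)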