scripts/community-dataset/convert-mbox-to-solr-docs.py (78 lines of code) (raw):

#!/usr/local/bin/python3 import mailbox import sys import os import uuid import json from datetime import datetime # Potential Improvements: # - more cleaning for 'subject' and other free text fields # - some regex parsing to separate sender name/email in 'from' fields # - capture other fields def convert_message_to_solr_doc(message, source_list): solr_doc = dict() solr_doc["id"] = str(uuid.uuid4()) solr_doc["from_s"] = message.get("From") solr_doc["list_s"] = source_list # List-Id, for whatever reason, is always 'dev.solr.apache.org', so omitting this for now #if "List-Id" in message: # solr_doc["mailing_list_s"] = message["List-Id"] # 'To' might contain multiple addresses, separated by commas sender_unsplit = message.get("To") senders = [line.strip() for line in sender_unsplit.split(",")] solr_doc["to_s"] = sender_unsplit solr_doc["to_ss"] = senders # Solr requires dates in a particular format date_str_raw = message.get("Date").replace("(MST)", "").replace("(UTC)", "").replace("(CST)", "").replace("(EST)", "").strip() try: date_obj = datetime.strptime(date_str_raw, "%a, %d %b %Y %H:%M:%S %z") except ValueError: date_obj = datetime.strptime(date_str_raw, "%d %b %Y %H:%M:%S %z") solr_doc["sent_dt"] = date_obj.strftime("%Y-%m-%dT%H:%M:%SZ") solr_doc["date_bucket_month_s"] = to_monthly_bucket(date_obj) solr_doc["date_bucket_quarter_s"] = to_quarterly_bucket(date_obj) subject_raw = message.get("Subject") subject_cleaned = subject_raw.lower() if subject_cleaned.startswith("re: "): subject_cleaned = subject_cleaned.replace("re: ", "", 1) solr_doc["subject_raw_s"] = subject_raw solr_doc["subject_raw_txt"] = subject_raw solr_doc["subject_clean_s"] = subject_cleaned solr_doc["subject_clean_txt"] = subject_cleaned return solr_doc def to_monthly_bucket(date_obj): padded_month = str(date_obj.month) if len(padded_month) == 1: padded_month = "0" + padded_month return str(date_obj.year) + "-" + padded_month # Returns a string representing the ASF fiscal quarter this email was sent in. (Useful for compiling quarterly reports!) # ASF Fiscal quarters are a bit odd. I don't understand them. But the logic appears to be, taking FY2020 as an example: # - Q1 of FY2020 is May, June, and July of 2019 # - Q2 of FY2020 is August, September, October of 2019 # - Q3 of FY2020 is November and December of 2019, and January of 2020 # - Q4 of FY2020 is February, March, and April of 2020 # Why would "Q1" start in May? Why would the FY and the calendar year be offset in this manner? :shrug: def to_quarterly_bucket(date_obj): month = date_obj.month year = date_obj.year if month >= 2 and month <= 4: quarter = "Q4" fiscal_year = year elif month >= 5 and month <= 7: quarter = "Q1" fiscal_year = year + 1 elif month >= 8 and month <= 10: quarter = "Q2" fiscal_year = year + 1 else: # month = 11, 12, 1 quarter = "Q3" if month == 1: fiscal_year = year else: fiscal_year = year + 1 return str(year) + "-" + quarter if __name__ == "__main__": if len(sys.argv) != 3: print("Incorrect arguments provided") print(" Usage: convert-mbox-to-solr-docs.py <mbox-file> <output-directory>") sys.exit(1) mbox_filepath = sys.argv[1] output_directory = sys.argv[2] # Filename is assumed to be in the form sourceList-YYYY-MM.mbox (e.g. builds-2023-5.mbox) mbox_filename = os.path.basename(mbox_filepath) source_list = mbox_filename.split("-")[0] solr_doc_filename = mbox_filename.replace(".mbox", ".json") solr_doc_filepath = os.path.join(output_directory, solr_doc_filename) with open(solr_doc_filepath, 'w') as solr_doc_writer: solr_doc_writer.write("[") first_doc = True for message in mailbox.mbox(mbox_filepath): if not first_doc: solr_doc_writer.write(",") first_doc = False solr_doc_writer.write("\n") solr_doc = convert_message_to_solr_doc(message, source_list) json.dump(solr_doc, solr_doc_writer) solr_doc_writer.write("]")