def convert_message_to_solr_doc()

in scripts/community-dataset/convert-mbox-to-solr-docs.py [0:0]


def convert_message_to_solr_doc(message, source_list):
    """Build a Solr document (a plain dict) from one mailing-list message.

    Args:
        message: Object exposing ``get(header_name)``; a header that is
            absent is presumably returned as ``None`` (confirm against the
            caller's message type).
        source_list: Value stored verbatim in the ``list_s`` field.

    Returns:
        A dict keyed by Solr field name: ``id`` (a fresh UUID4 string),
        ``from_s``, ``list_s``, ``to_s`` / ``to_ss``, ``sent_dt``,
        ``date_bucket_month_s``, ``date_bucket_quarter_s`` and the four
        ``subject_*`` fields.

    Raises:
        ValueError: If the ``Date`` header is missing or matches neither of
            the two supported layouts.
    """
    # Function-scope import so this block does not depend on the module
    # header already importing ``timezone``.
    from datetime import timezone

    solr_doc = {
        "id": str(uuid.uuid4()),
        "from_s": message.get("From"),
        "list_s": source_list,
    }

    # 'List-Id' is deliberately not indexed: it was observed to always be
    # 'dev.solr.apache.org', so it carries no information for now.

    # 'To' might contain multiple addresses, separated by commas. A missing
    # header yields an empty recipient list instead of an AttributeError.
    to_raw = message.get("To")
    recipients = []
    if to_raw is not None:
        stripped_parts = (part.strip() for part in to_raw.split(","))
        recipients = [part for part in stripped_parts if part]
    solr_doc["to_s"] = to_raw
    solr_doc["to_ss"] = recipients

    # Solr requires dates in a particular format (UTC, with a trailing 'Z').
    date_header = message.get("Date")
    if date_header is None:
        raise ValueError("message has no 'Date' header")
    # Drop any trailing parenthesised zone comment such as "(MST)" or
    # "(PDT)"; strptime cannot parse it and the numeric offset is enough.
    date_str = date_header.partition("(")[0].strip()
    try:
        date_obj = datetime.strptime(date_str, "%a, %d %b %Y %H:%M:%S %z")
    except ValueError:
        # Some messages omit the leading day-of-week.
        date_obj = datetime.strptime(date_str, "%d %b %Y %H:%M:%S %z")
    # Both layouts require %z, so date_obj is always timezone-aware. Convert
    # to UTC before appending the literal 'Z'; otherwise the sender's local
    # wall-clock time would be indexed as if it were UTC.
    sent_utc = date_obj.astimezone(timezone.utc)
    solr_doc["sent_dt"] = sent_utc.strftime("%Y-%m-%dT%H:%M:%SZ")
    # Buckets are still derived from the sender-local timestamp, as before.
    solr_doc["date_bucket_month_s"] = to_monthly_bucket(date_obj)
    solr_doc["date_bucket_quarter_s"] = to_quarterly_bucket(date_obj)

    # A missing 'Subject' leaves all subject fields as None (same treatment
    # 'from_s' gets) rather than crashing on ``None.lower()``.
    subject_raw = message.get("Subject")
    subject_cleaned = None
    if subject_raw is not None:
        subject_cleaned = subject_raw.lower()
        reply_prefix = "re: "
        if subject_cleaned.startswith(reply_prefix):
            # Strip a single leading reply marker only.
            subject_cleaned = subject_cleaned[len(reply_prefix):]
    solr_doc["subject_raw_s"] = subject_raw
    solr_doc["subject_raw_txt"] = subject_raw
    solr_doc["subject_clean_s"] = subject_cleaned
    solr_doc["subject_clean_txt"] = subject_cleaned

    return solr_doc