in scripts/community-dataset/convert-mbox-to-solr-docs.py [0:0]
def convert_message_to_solr_doc(message, source_list):
    """Build a Solr document (a plain dict) from one mailing-list message.

    Args:
        message: Object exposing header lookup through ``get(name)``
            (presumably an ``email.message.Message`` from an mbox -- confirm
            against the caller). The "From", "To", "Date" and "Subject"
            headers are read.
        source_list: Value stored unchanged in the ``list_s`` field.

    Returns:
        A dict holding a freshly generated ``id`` plus sender, recipient,
        date and subject fields. A header that is absent is stored as
        ``None`` (``to_ss`` becomes an empty list in that case).

    Raises:
        ValueError: If the "Date" header is missing, or matches neither of
            the two supported layouts (with or without a leading day name,
            always with a numeric UTC offset).
    """
    # Local imports: only names this function newly depends on.
    import re
    from datetime import timezone

    solr_doc = {}
    solr_doc["id"] = str(uuid.uuid4())
    solr_doc["from_s"] = message.get("From")
    solr_doc["list_s"] = source_list
    # List-Id, for whatever reason, is always 'dev.solr.apache.org', so it is
    # deliberately not indexed for now.

    # 'To' might contain multiple addresses, separated by commas. Empty
    # entries (e.g. from a trailing comma) are dropped.
    to_raw = message.get("To")
    if to_raw is None:
        recipients = []
    else:
        recipients = [addr.strip() for addr in to_raw.split(",") if addr.strip()]
    solr_doc["to_s"] = to_raw
    solr_doc["to_ss"] = recipients

    # Solr requires dates in a particular format, and the trailing 'Z' means
    # UTC, so the parsed timestamp must be converted before formatting.
    date_raw = message.get("Date")
    if date_raw is None:
        raise ValueError("message has no Date header")
    # Drop any parenthesised comment such as "(MST)" or "(UTC)"; strptime
    # cannot parse them and the numeric offset already carries the zone.
    date_str = re.sub(r"\([^()]*\)", "", date_raw).strip()
    try:
        date_obj = datetime.strptime(date_str, "%a, %d %b %Y %H:%M:%S %z")
    except ValueError:
        # Some messages omit the leading day name.
        date_obj = datetime.strptime(date_str, "%d %b %Y %H:%M:%S %z")
    # %z always yields an aware datetime, so this is a true conversion.
    date_utc = date_obj.astimezone(timezone.utc)
    solr_doc["sent_dt"] = date_utc.strftime("%Y-%m-%dT%H:%M:%SZ")
    # Buckets use the same UTC instant so they always agree with sent_dt.
    solr_doc["date_bucket_month_s"] = to_monthly_bucket(date_utc)
    solr_doc["date_bucket_quarter_s"] = to_quarterly_bucket(date_utc)

    # The cleaned subject is lower-cased with one leading "re: " removed.
    subject_raw = message.get("Subject")
    subject_cleaned = None
    if subject_raw is not None:
        subject_cleaned = subject_raw.lower()
        if subject_cleaned.startswith("re: "):
            subject_cleaned = subject_cleaned[len("re: "):]
    solr_doc["subject_raw_s"] = subject_raw
    solr_doc["subject_raw_txt"] = subject_raw
    solr_doc["subject_clean_s"] = subject_cleaned
    solr_doc["subject_clean_txt"] = subject_cleaned
    return solr_doc