in community/document-readers/spring-ai-alibaba-starter-document-reader-mbox/src/main/java/com/alibaba/cloud/ai/reader/mbox/MboxDocumentReader.java [167:309]
/**
 * Parses one raw message taken from an mbox file into a {@link Document}.
 * <p>
 * The text up to the first blank line is read as the header section; folded
 * (continuation) header lines are appended to the header they belong to. The rest
 * is the body: for a multipart message the body is selected by
 * {@link #extractMultipartBody}, for a single-part {@code text/html} message the
 * body is passed through {@code parseHtmlContent}, otherwise it is used verbatim.
 * The resulting text is rendered with {@code messageFormat} together with the
 * date, from, to and subject values.
 * @param messageContent the raw message text, lines separated by {@code \n}
 * @return the parsed document, or {@code null} when no line of the header section
 * matched {@code HEADER_PATTERN}
 * @throws RuntimeException if a {@code Date} header is present but cannot be parsed
 * by {@code dateFormat}
 */
private Document parseMessage(String messageContent) {
    Map<String, String> headers = new HashMap<>();
    String[] lines = messageContent.split("\n");

    // ---- Header section: everything up to the first blank line ----
    int bodyStart = lines.length;
    String lastHeaderName = null;
    boolean foundValidHeaders = false;
    for (int i = 0; i < lines.length; i++) {
        String line = lines[i];
        if (line.trim().isEmpty()) {
            bodyStart = i + 1;
            break;
        }
        // A line starting with whitespace continues the previous header (header
        // folding). Without this a boundary parameter placed on its own line is lost.
        char first = line.charAt(0);
        if ((first == ' ' || first == '\t') && lastHeaderName != null) {
            headers.merge(lastHeaderName, line.trim(),
                    (previous, extra) -> previous.isEmpty() ? extra : previous + " " + extra);
            continue;
        }
        Matcher headerMatcher = HEADER_PATTERN.matcher(line);
        if (headerMatcher.matches()) {
            lastHeaderName = headerMatcher.group(1).trim();
            headers.put(lastHeaderName, headerMatcher.group(2).trim());
            foundValidHeaders = true;
        }
    }

    // If no valid headers were found, this is not a valid message
    if (!foundValidHeaders) {
        logger.warn("No valid headers found in message");
        return null;
    }

    // ---- Body ----
    String contentType = headers.get("Content-Type");
    String boundary = null;
    if (contentType != null && contentType.contains("multipart")) {
        Matcher boundaryMatcher = BOUNDARY_PATTERN.matcher(contentType);
        if (boundaryMatcher.find()) {
            boundary = boundaryMatcher.group(1);
        }
    }

    String body;
    if (boundary != null) {
        body = extractMultipartBody(lines, bodyStart, boundary);
    }
    else {
        StringBuilder raw = new StringBuilder();
        for (int i = bodyStart; i < lines.length; i++) {
            raw.append(lines[i]).append("\n");
        }
        body = raw.toString();
        // Single-part HTML message: parse the complete body once, keeping the raw
        // text if the parser yields nothing.
        if (bodyStart < lines.length && contentType != null && contentType.contains("text/html")) {
            String parsedHtml = parseHtmlContent(body);
            if (!parsedHtml.isEmpty()) {
                body = parsedHtml;
            }
        }
    }

    // ---- Metadata ----
    Map<String, Object> metadata = new HashMap<>();
    metadata.put("subject", headers.getOrDefault("Subject", ""));
    metadata.put("from", headers.getOrDefault("From", ""));
    metadata.put("to", headers.getOrDefault("To", ""));
    String dateStr = headers.get("Date");
    if (dateStr != null) {
        try {
            metadata.put("date", dateFormat.parse(dateStr));
        }
        catch (ParseException e) {
            throw new RuntimeException("Failed to parse date: " + e.getMessage(), e);
        }
    }

    String messageId = headers.get("Message-ID");
    String messageIdForLog = messageId != null ? messageId : "unknown";

    // Check if content is empty
    String contentStr = body.trim();
    if (contentStr.isEmpty()) {
        // Instead of throwing an exception, provide a default message
        logger.warn("Empty content found for message: {}", messageIdForLog);
        contentStr = "[No content available]";
    }

    // Format the content
    String formattedContent = String.format(messageFormat, metadata.getOrDefault("date", ""), metadata.get("from"),
            metadata.get("to"), metadata.get("subject"), contentStr);
    if (formattedContent.trim().isEmpty()) {
        // Instead of throwing an exception, provide a default formatted content
        logger.warn("Empty formatted content for message: {}", messageIdForLog);
        formattedContent = String.format("Empty email with ID: %s", messageIdForLog);
    }

    // Use Message-ID as document ID. The fallback must be unique per message: a
    // millisecond timestamp repeats when several messages without a Message-ID are
    // parsed within the same millisecond. (Fully qualified to avoid a new import.)
    String id = messageId != null ? messageId : "msg-" + java.util.UUID.randomUUID();
    return new Document(id, formattedContent, metadata);
}

/**
 * Selects the body text of a multipart message.
 * <p>
 * Lines before the first delimiter (preamble) and after the closing delimiter
 * (epilogue) are ignored. Within a part, a line starting with {@code Content-Type:}
 * decides how the part is treated: {@code text/html} parts are parsed with
 * {@code parseHtmlContent}, {@code text/plain} parts are taken as-is, any other type
 * is skipped. An HTML part replaces whatever was selected before; a plain part is
 * only used while nothing has been selected yet.
 * @param lines all lines of the message
 * @param bodyStart index of the first body line
 * @param boundary the multipart boundary, without the leading {@code --}
 * @return the selected body text, possibly empty
 */
private String extractMultipartBody(String[] lines, int bodyStart, String boundary) {
    String delimiter = "--" + boundary;
    String closeDelimiter = delimiter + "--";
    String selected = "";
    StringBuilder currentPart = new StringBuilder();
    boolean inPart = false;
    boolean inHtmlPart = false;
    boolean skipCurrentPart = false;

    for (int i = bodyStart; i < lines.length; i++) {
        String line = lines[i];
        if (line.contains(delimiter)) {
            // A delimiter ends the part collected so far.
            selected = mergePart(selected, currentPart, inHtmlPart, skipCurrentPart);
            currentPart.setLength(0);
            inHtmlPart = false;
            skipCurrentPart = false;
            // The closing delimiter ends the last part; what follows is epilogue.
            inPart = !line.contains(closeDelimiter);
            continue;
        }
        if (!inPart) {
            // Preamble or epilogue: not part of the message content.
            continue;
        }
        // Check content type of the part
        if (line.startsWith("Content-Type:")) {
            if (line.contains("text/html")) {
                inHtmlPart = true;
                skipCurrentPart = false;
            }
            else if (!line.contains("text/plain")) {
                // Skip non-text parts
                skipCurrentPart = true;
            }
            continue;
        }
        if (!skipCurrentPart) {
            currentPart.append(line).append("\n");
        }
    }

    // A truncated message may lack the closing delimiter; keep its last part.
    return mergePart(selected, currentPart, inHtmlPart, skipCurrentPart);
}

/**
 * Folds one finished MIME part into the body selected so far.
 * @param selected the body text selected from earlier parts
 * @param part the collected text of the finished part
 * @param html whether the part was declared as {@code text/html}
 * @param skipped whether the part was marked to be skipped
 * @return the body text to carry forward; a copy of the part's text is returned, never
 * a reference to the reusable part buffer
 */
private String mergePart(String selected, StringBuilder part, boolean html, boolean skipped) {
    if (part.isEmpty() || skipped) {
        return selected;
    }
    if (html) {
        String parsedHtml = parseHtmlContent(part.toString());
        return parsedHtml.isEmpty() ? selected : parsedHtml;
    }
    return selected.isEmpty() ? part.toString() : selected;
}