private Document parseMessage()

in community/document-readers/spring-ai-alibaba-starter-document-reader-mbox/src/main/java/com/alibaba/cloud/ai/reader/mbox/MboxDocumentReader.java [167:309]


	/**
	 * Parses the raw text of a single message into a {@link Document}.
	 * <p>
	 * Lines up to the first blank line are treated as headers and matched against
	 * {@code HEADER_PATTERN}; everything after is the body. When the top-level
	 * {@code Content-Type} header contains {@code multipart} and a boundary can be
	 * extracted with {@code BOUNDARY_PATTERN}, the body is split on that boundary: a
	 * {@code text/html} part (after {@code parseHtmlContent}) replaces any content
	 * collected so far, a {@code text/plain} part is used only if no content has been
	 * collected yet, and parts with any other content type are skipped. For
	 * non-multipart messages the whole body is used, passed through
	 * {@code parseHtmlContent} when the top-level content type contains
	 * {@code text/html}.
	 * <p>
	 * The resulting document carries the metadata keys {@code subject}, {@code from},
	 * {@code to} and, when a {@code Date} header is present, {@code date}. Its id is the
	 * {@code Message-ID} header, or {@code "msg-" + System.currentTimeMillis()} when
	 * that header is absent.
	 * @param messageContent the raw message text, lines separated by {@code \n}
	 * @return the parsed document, or {@code null} if no line matched
	 * {@code HEADER_PATTERN}
	 * @throws RuntimeException if the {@code Date} header cannot be parsed by
	 * {@code dateFormat}
	 */
	private Document parseMessage(String messageContent) {
		Map<String, Object> metadata = new HashMap<>();
		Map<String, String> headers = new HashMap<>();
		StringBuilder content = new StringBuilder();
		String[] lines = messageContent.split("\n");

		boolean inHeaders = true;
		String boundary = null;
		// "--" + boundary, computed once when the header section ends
		String delimiter = null;
		// Whether the top-level Content-Type declares text/html (non-multipart case)
		boolean htmlBody = false;
		boolean inHtmlPart = false;
		boolean skipCurrentPart = false;
		StringBuilder currentPart = new StringBuilder();
		boolean foundValidHeaders = false;

		for (int i = 0; i < lines.length; i++) {
			String line = lines[i];

			if (inHeaders) {
				if (line.trim().isEmpty()) {
					// A blank line ends the header section. The headers map is not
					// modified after this point, so the content-type decisions can be
					// made once here instead of on every body line.
					inHeaders = false;
					String contentType = headers.get("Content-Type");
					if (contentType != null) {
						htmlBody = contentType.contains("text/html");
						// Check if this is a multipart message
						if (contentType.contains("multipart")) {
							Matcher m = BOUNDARY_PATTERN.matcher(contentType);
							if (m.find()) {
								boundary = m.group(1);
							}
						}
					}
					if (boundary != null) {
						delimiter = "--" + boundary;
					}
					continue;
				}

				Matcher m = HEADER_PATTERN.matcher(line);
				if (m.matches()) {
					String name = m.group(1).trim();
					String value = m.group(2).trim();
					headers.put(name, value);
					foundValidHeaders = true;
				}
				continue;
			}

			// Process message body
			if (boundary != null) {
				if (line.contains(delimiter)) {
					// A boundary line closes the part collected so far
					if (!currentPart.isEmpty() && !skipCurrentPart) {
						if (inHtmlPart) {
							// Parse HTML content and set as current content
							String parsedHtml = parseHtmlContent(currentPart.toString());
							if (!parsedHtml.isEmpty()) {
								content = new StringBuilder(parsedHtml);
							}
						}
						else if (content.isEmpty()) {
							// Copy the buffer: currentPart is cleared and reused just
							// below, so sharing the same instance would wipe the
							// selected content and let later parts leak into it.
							content = new StringBuilder(currentPart);
						}
					}
					currentPart.setLength(0);
					inHtmlPart = false;
					skipCurrentPart = false;
					continue;
				}

				// Check content type of the part
				if (line.startsWith("Content-Type:")) {
					if (line.contains("text/html")) {
						inHtmlPart = true;
						skipCurrentPart = false;
					}
					else if (!line.contains("text/plain")) {
						// Skip non-text parts
						skipCurrentPart = true;
					}
					continue;
				}

				if (!skipCurrentPart) {
					currentPart.append(line).append("\n");
				}
			}
			else {
				// For non-multipart messages every body line is collected
				content.append(line).append("\n");
				if (htmlBody && i == lines.length - 1) {
					// Parse the complete HTML content once the last line is in
					String parsedHtml = parseHtmlContent(content.toString());
					if (!parsedHtml.isEmpty()) {
						content = new StringBuilder(parsedHtml);
					}
				}
			}
		}

		// If no valid headers were found, this is not a valid message
		if (!foundValidHeaders) {
			logger.warn("No valid headers found in message");
			return null;
		}

		// Extract metadata
		metadata.put("subject", headers.getOrDefault("Subject", ""));
		metadata.put("from", headers.getOrDefault("From", ""));
		metadata.put("to", headers.getOrDefault("To", ""));
		try {
			String dateStr = headers.get("Date");
			if (dateStr != null) {
				metadata.put("date", dateFormat.parse(dateStr));
			}
		}
		catch (ParseException e) {
			throw new RuntimeException("Failed to parse date: " + e.getMessage(), e);
		}

		// Check if content is empty
		String contentStr = content.toString().trim();
		if (contentStr.isEmpty()) {
			// Instead of throwing an exception, provide a default message
			logger.warn("Empty content found for message: {}", headers.getOrDefault("Message-ID", "unknown"));
			contentStr = "[No content available]";
		}

		// Format the content
		String formattedContent = String.format(messageFormat, metadata.getOrDefault("date", ""), metadata.get("from"),
				metadata.get("to"), metadata.get("subject"), contentStr);

		// Check if formatted content is empty
		if (formattedContent.trim().isEmpty()) {
			// Instead of throwing an exception, provide a default formatted content
			logger.warn("Empty formatted content for message: {}", headers.getOrDefault("Message-ID", "unknown"));
			formattedContent = String.format("Empty email with ID: %s", headers.getOrDefault("Message-ID", "unknown"));
		}

		// Use Message-ID as document ID
		String id = headers.getOrDefault("Message-ID", "msg-" + System.currentTimeMillis());

		return new Document(id, formattedContent, metadata);
	}