frontend/src/components/Answer/AnswerParser.tsx (64 lines of code) (raw):
// src/components/Answer/AnswerParser.tsx
// Import necessary modules and types
import { getCitationFilePath } from "../../api";
import DOMPurify from "dompurify";
import { marked } from "marked";
/**
 * Structure returned by `parseAnswerToHtml` for a single raw answer string.
 */
type HtmlParsedAnswer = {
/** HTML rendered from the Markdown answer and passed through DOMPurify. */
answerHtml: string;
/**
 * Unique, trimmed citation strings in order of first appearance.
 * Stays empty when the answer is parsed with sources hidden.
 */
citations: string[];
/** Trimmed text of every `<<...>>` follow-up question found in the answer. */
followupQuestions: string[];
};
/**
 * Strips every bracketed citation marker out of a piece of text.
 * Any run that starts with `[` and ends at the next `]` (an empty `[]`
 * included) is treated as a citation and deleted.
 *
 * @param text - Input that may contain `[citation]` markers.
 * @returns The input with all bracketed segments deleted; whitespace around
 * the removed segments is left untouched.
 */
export function removeCitations(text: string): string {
  // Cutting the text at each citation and gluing the remaining pieces back
  // together drops the bracketed segments.
  const keptPieces = text.split(/\[[^\]]*\]/g);
  return keptPieces.join("");
}
/**
 * Escapes a string so it can be interpolated inside a double-quoted HTML
 * attribute value without terminating the attribute or opening a tag.
 *
 * @param value - Raw attribute text.
 * @returns The text with `&`, `<`, `>`, `"` and `'` replaced by character references.
 */
function escapeHtmlAttribute(value: string): string {
  return value
    .replace(/&/g, "&amp;") // must run first so later entities are not double-escaped
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;")
    .replace(/"/g, "&quot;")
    .replace(/'/g, "&#39;");
}

/**
 * Parses the answer string into sanitized HTML, extracts citations,
 * and identifies any follow-up questions.
 *
 * Processing order:
 * 1. `<<...>>` follow-up questions are collected and removed from the text.
 * 2. `[citation]` markers are either swapped for placeholders (sources shown)
 *    or removed via `removeCitations` (sources hidden).
 * 3. The remaining Markdown is converted with `marked.parse` and all `<p>` /
 *    `</p>` tags are stripped from the result.
 * 4. Placeholders are replaced by `<a class="citation-link">` anchors carrying
 *    the citation text and file path in `data-*` attributes.
 * 5. The final HTML is sanitized with DOMPurify.
 *
 * @param answer - The raw answer string containing Markdown and citations.
 * @param showSources - When true, citations are rendered as numbered links and
 * returned in `citations`; when false they are removed and `citations` is empty.
 * @param onCitationClicked - Kept for interface compatibility; this function
 * never invokes it. Presumably the caller attaches click handling to the
 * generated `citation-link` anchors — confirm against the rendering component.
 * @returns An object containing the sanitized HTML, list of citations, and follow-up questions.
 */
export function parseAnswerToHtml(
  answer: string,
  showSources: boolean,
  onCitationClicked: (citationFilePath: string, filename: string) => void
): HtmlParsedAnswer {
  const citations: string[] = [];
  const followupQuestions: string[] = [];

  // 1. Collect follow-up questions enclosed in << >>, drop them from the text,
  //    then trim the whitespace they leave behind.
  const answerBody = answer
    .replace(/<<([^>]+)>>/g, (_match: string, content: string) => {
      followupQuestions.push(content.trim());
      return "";
    })
    .trim();

  // 2. Either replace each [citation] with a placeholder (deduplicating on the
  //    trimmed text) or remove citations entirely.
  const markdownSource = showSources
    ? answerBody.replace(/\[([^\]]+)\]/g, (_match: string, citation: string) => {
        const trimmedCitation = citation.trim();
        let position = citations.indexOf(trimmedCitation);
        if (position === -1) {
          position = citations.push(trimmedCitation) - 1;
        }
        // The `_END` terminator keeps digits that directly follow the citation
        // in the answer (e.g. "[doc.pdf]2023") from being read as part of the index.
        return `CITATION_MARKER_${position + 1}_END`;
      })
    : removeCitations(answerBody);

  // 3. Convert Markdown to HTML and strip paragraph wrappers.
  //    The assertion narrows marked's return type to string; this assumes
  //    marked runs in its synchronous mode — confirm against the configured options.
  let htmlContent = (marked.parse(markdownSource) as string).replace(/<\/?p>/g, "");

  if (showSources) {
    // 4. Replace citation placeholders with anchors that carry the citation
    //    text and file path as data attributes.
    htmlContent = htmlContent.replace(
      /CITATION_MARKER_(\d+)_END/g,
      (marker: string, index: string) => {
        const citationIndex = parseInt(index, 10);
        const citation = citations[citationIndex - 1];
        if (citation === undefined) {
          // Marker-shaped text that was already in the answer and maps to no
          // collected citation: leave it as-is instead of emitting "undefined".
          return marker;
        }
        const path = getCitationFilePath(citation);
        const safeCitation = escapeHtmlAttribute(citation);
        const safePath = escapeHtmlAttribute(path);
        return `<a class="supContainer citation-link" title="${safeCitation}" data-citation="${safeCitation}" data-path="${safePath}" href="#"><sup>${citationIndex}</sup></a>`;
      }
    );
  }

  // 5. Sanitize the HTML to prevent XSS attacks and allow specific tags and attributes.
  const sanitizedHtml = DOMPurify.sanitize(htmlContent, {
    ADD_TAGS: ["sup", "a"],
    ADD_ATTR: ["class", "title", "data-citation", "data-path", "href"],
  });

  return {
    answerHtml: sanitizedHtml,
    citations,
    followupQuestions,
  };
}