code/embedding-function/utilities/parser/output_parser_tool.py (79 lines of code) (raw):
import json
import logging
import re
from typing import List, Optional

from .parser_base import ParserBase
from ..common.source_document import SourceDocument
logger = logging.getLogger(__name__)
class OutputParserTool(ParserBase):
    """Format an LLM answer and its cited source documents into the
    two-message structure (a "tool" message carrying JSON-stringified
    citations, then an "assistant" message carrying the answer) expected
    by the Azure BYOD-style chat frontend.
    """

    def __init__(self) -> None:
        self.name = "OutputParser"

    def _clean_up_answer(self, answer: str) -> str:
        """Collapse double spaces in the answer into single spaces.

        Note: the previous implementation replaced a single space with a
        single space (a no-op); the intent was clearly to collapse the
        double spaces models sometimes emit.
        """
        return answer.replace("  ", " ")

    def _get_source_docs_from_answer(self, answer: str) -> List[int]:
        """Extract every ``[docN]`` reference from *answer*.

        Returns the Ns as ints, in order of appearance, duplicates
        preserved (each occurrence later produces its own citation).
        """
        return [int(n) for n in re.findall(r"\[doc(\d+)\]", answer)]

    def _make_doc_references_sequential(self, answer: str) -> str:
        """Renumber ``[docN]`` references so they read [doc1], [doc2], ...
        in order of occurrence, regardless of their original numbers.
        """
        # A stateful re.sub callback replaces the previous manual
        # start/end/offset arithmetic — same output, no bookkeeping.
        counter = 0

        def _renumber(_match: "re.Match") -> str:
            nonlocal counter
            counter += 1
            return f"[doc{counter}]"

        return re.sub(r"\[doc\d+\]", _renumber, answer)

    def _build_citation(self, doc: SourceDocument) -> dict:
        """Build one citation entry for *doc*.

        The entry needs ``filepath`` and ``chunk_id`` for the UI to
        render it as a clickable file reference.
        """
        markdown_url = doc.get_markdown_url()
        return {
            "content": markdown_url + "\n\n\n" + doc.content,
            "id": doc.id,
            # UI wants a numeric chunk id: take the trailing number out of
            # chunk_id when present, otherwise fall back to the raw chunk.
            "chunk_id": (
                re.findall(r"\d+", doc.chunk_id)[-1]
                if doc.chunk_id is not None
                else doc.chunk
            ),
            "title": doc.title,
            "filepath": doc.get_filename(include_path=True),
            "url": markdown_url,
            "metadata": {
                "offset": doc.offset,
                "source": doc.source,
                "markdown_url": markdown_url,
                "title": doc.title,
                "original_url": doc.source,  # TODO: do we need this?
                "chunk": doc.chunk,
                "key": doc.id,
                "filename": doc.get_filename(),
            },
        }

    def parse(
        self,
        question: str,
        answer: str,
        source_documents: Optional[List[SourceDocument]] = None,
        **kwargs: dict,
    ) -> List[dict]:
        """Turn *answer* plus *source_documents* into frontend messages.

        :param question: the user question, echoed back as the intent.
        :param answer: raw LLM answer, possibly containing [docN] markers.
        :param source_documents: documents the markers refer to, 1-based
            (``[doc1]`` -> ``source_documents[0]``). Defaults to none.
        :return: ``[tool_message, assistant_message]`` where the tool
            message's ``content`` is a JSON string of citations + intent.
        """
        logger.info("Method parse of output_parser_tool started")
        # None sentinel instead of a mutable [] default argument.
        if source_documents is None:
            source_documents = []

        answer = self._clean_up_answer(answer)
        doc_ids = self._get_source_docs_from_answer(answer)
        answer = self._make_doc_references_sequential(answer)

        messages = [
            {
                "role": "tool",
                "content": {"citations": [], "intent": question},
                "end_turn": False,
            }
        ]
        for i in doc_ids:
            idx = i - 1
            # Guard both ends: a hallucinated [doc0] would otherwise index
            # source_documents[-1] and silently cite the wrong document.
            if idx < 0 or idx >= len(source_documents):
                logger.warning("Source document %s not provided, skipping doc", i)
                continue
            doc = source_documents[idx]
            logger.debug("doc%s: %s", idx, doc)
            messages[0]["content"]["citations"].append(self._build_citation(doc))

        if not messages[0]["content"]["citations"]:
            # Every reference was unresolvable: strip the dangling markers.
            answer = re.sub(r"\[doc\d+\]", "", answer)
        messages.append({"role": "assistant", "content": answer, "end_turn": True})
        # Everything in content must be stringified to work with the
        # Azure BYOD frontend.
        messages[0]["content"] = json.dumps(messages[0]["content"])
        logger.info("Method parse of output_parser_tool ended")
        return messages