in components/llm_service/src/services/query/query_service.py [0:0]
def make_query_reference(q_engine: QueryEngine,
                         query_doc: QueryDocument,
                         doc_chunk: QueryDocumentChunk,
                         query_embeddings: List[Optional[List[float]]],
                         rank_sentences: bool = False) -> QueryReference:
    """
    Make a single QueryReference object, with appropriate fields
    for the modality of the document chunk.

    Fields common to every modality are the query engine id and name, the
    document id and url, the modality, the chunk id and the linked ids.
    Text chunks add the page and document text; image chunks add the page
    and chunk url; video and audio chunks add the chunk url and the
    start/stop timestamps. Any other modality gets the common fields only.

    Args:
      q_engine: The QueryEngine object that was searched
      query_doc: The QueryDocument object retrieved from q_engine
      doc_chunk: The QueryDocumentChunk object of the retrieved query_doc
      query_embeddings: The embedding vector for the query prompt
      rank_sentences: Only used for text chunks. If True, the chunk's
        sentences are passed to get_top_relevant_sentences and the result,
        joined with single spaces, replaces the cleaned chunk text in the
        reference (when there is at least one sentence).
    Returns:
      query_reference: The QueryReference object corresponding to doc_chunk
    """
    # Get modality of document chunk, make lowercase.
    # If modality is None, set it equal to default value "text".
    modality = doc_chunk.modality
    if modality is None:
        modality = "text"
    modality = modality.casefold()

    # Only text chunks are cleaned up before building the reference.
    # TODO: Placeholder - add clean-up logic for image, video and audio
    # chunks here once it exists.
    if modality == "text":
        # Prefer the stored clean text; otherwise clean the raw chunk text.
        clean_text = doc_chunk.clean_text
        if not clean_text:
            clean_text = text_helper.clean_text(doc_chunk.text)

        # Pick out sentences from document chunk and rank them.
        if rank_sentences:
            # Use the stored sentences, or split the raw text when the
            # chunk has none stored.
            sentences = doc_chunk.sentences
            if not sentences:
                sentences = text_helper.text_to_sentence_list(doc_chunk.text)

            # Count defensively: the sentence list may still be empty or
            # None here, and logging must not fail before the guard below.
            sentence_count = len(sentences) if sentences else 0
            Logger.info(f"Processing {sentence_count} sentences.")

            # Only update clean_text when sentences is not empty.
            if sentences:
                top_sentences = get_top_relevant_sentences(
                    q_engine, query_embeddings, sentences,
                    expand_neighbors=2, highlight_top_sentence=True)
                clean_text = " ".join(top_sentences)

    # Fields shared by chunks of any modality.
    query_reference_dict = {
        "query_engine_id": q_engine.id,
        "query_engine": q_engine.name,
        "document_id": query_doc.id,
        "document_url": query_doc.doc_url,
        "modality": modality,
        "chunk_id": doc_chunk.id,
        "linked_ids": doc_chunk.linked_ids,
    }

    # Modality-specific fields.
    if modality == "text":
        query_reference_dict.update(
            page=doc_chunk.page,
            document_text=clean_text)
    elif modality == "image":
        query_reference_dict.update(
            page=doc_chunk.page,
            chunk_url=doc_chunk.chunk_url)
    elif modality in {"video", "audio"}:
        query_reference_dict.update(
            chunk_url=doc_chunk.chunk_url,
            timestamp_start=doc_chunk.timestamp_start,
            timestamp_stop=doc_chunk.timestamp_stop)

    # Create query_reference out of dict and return it.
    return QueryReference.from_dict(query_reference_dict)