src/models/rag.py
import os
from time import time
from typing import Dict, List, Optional

# Module-level imports assumed by this snippet (llama-index >= 0.10 package layout;
# older releases expose the same classes under the top-level `llama_index` package).
from llama_index.core import (Document, SimpleKeywordTableIndex, StorageContext,
                              VectorStoreIndex, load_index_from_storage)
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.postprocessor import SentenceTransformerRerank
from llama_index.core.retrievers import KeywordTableSimpleRetriever, VectorIndexRetriever


def __init__(self, chunk_list: List[Dict[str, str]] = [],
             storage_path: str = './data/textbooks/rag_storage',
             emb_model_path: str = "local:../../models/rag_embedding/bge-m3",
             chunk_size: int = 1024, similarity_top_k: int = 3, hybrid_search: bool = False,
             reranker_path: Optional[str] = None, rerank_top_n: int = 3,
             **kwargs):
    os.makedirs(storage_path, exist_ok=True)
    if len(os.listdir(storage_path)) == 0:
        # No persisted index yet: chunk the corpus, embed it, and persist the index.
        assert len(chunk_list) > 0, 'chunk_list must be non-empty when building a new index'
        documents = [Document(text=chunk['data'], doc_id=chunk['idx']) for chunk in chunk_list]
        node_parser = SentenceSplitter(chunk_size=chunk_size, chunk_overlap=0)
        nodes = node_parser.get_nodes_from_documents(documents)
        self.index = VectorStoreIndex(nodes, embed_model=emb_model_path, show_progress=True, **kwargs)
        self.index.storage_context.persist(storage_path)
    else:
        # A persisted index exists: load it instead of re-embedding the corpus.
        print('Loading LlamaIndex Storage ...')
        t0 = time()
        storage_context = StorageContext.from_defaults(persist_dir=storage_path)
        self.index = load_index_from_storage(storage_context, embed_model=emb_model_path)
        print(f'Done in {time() - t0:.1f} seconds.')
    # Dense retriever over the vector index.
    self.retriever: VectorIndexRetriever = self.index.as_retriever(similarity_top_k=similarity_top_k)
    if hybrid_search:
        # Keyword index over the same nodes, giving a second, sparse retriever.
        nodes = list(self.index.storage_context.docstore.docs.values())
        self.keyword_index = SimpleKeywordTableIndex(nodes, show_progress=True)
        self.keyword_retriever: KeywordTableSimpleRetriever = \
            self.keyword_index.as_retriever(num_chunks_per_query=similarity_top_k)
    else:
        self.keyword_retriever = None
    if reranker_path:
        # Optional cross-encoder reranker applied to retrieved candidates.
        self.rerank = SentenceTransformerRerank(top_n=rerank_top_n, model=reranker_path)
    else:
        self.rerank = None
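
For context, a minimal usage sketch follows. The enclosing class name is not visible in this excerpt, so `RAG` below is an assumed name taken from the file name; the chunk IDs and query text are purely illustrative, and building the index also requires the BGE-M3 embedding model to be available at `emb_model_path`.

# Usage sketch -- "RAG" is an assumed class name (only __init__ is shown above).
chunks = [
    {'idx': 'anatomy-0001', 'data': 'The human heart has four chambers ...'},
    {'idx': 'anatomy-0002', 'data': 'Blood enters the heart through the right atrium ...'},
]
rag = RAG(chunk_list=chunks, similarity_top_k=3, hybrid_search=True)
query = 'How many chambers does the heart have?'
dense_hits = rag.retriever.retrieve(query)
keyword_hits = rag.keyword_retriever.retrieve(query)
if rag.rerank is not None:  # only set when reranker_path is given
    dense_hits = rag.rerank.postprocess_nodes(dense_hits, query_str=query)
for hit in dense_hits:
    print(hit.node.node_id, hit.score)

Note that the constructor only wires up the dense retriever, the optional keyword retriever, and the optional reranker; how the two retrievers' results are fused under hybrid_search is handled elsewhere in the class and is not shown here.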