in agora/cerebral_api/src/app.py [0:0]
def initialize_and_validate_chroma() -> bool:
    """
    Initialize ChromaDB, index documents if the collection is empty, and
    validate that the collection ends up containing documents.

    Steps:
        1. Build a ``DocumentIndexer`` (imported lazily from ``indexer``).
        2. Count the documents in ``indexer.collection``.
        3. If the collection is empty, index everything under
           ``DOCUMENTS_PATH`` and count again.
        4. When ``VERBOSE`` is set, log a small sample of the collection
           at debug level (best effort; failures there are only logged).

    Returns:
        True if the collection contains documents (either already or after
        indexing). False if ``DOCUMENTS_PATH`` is unset or does not exist,
        if indexing produced no documents, or if any step raised; the
        failure is reported through ``logger`` rather than propagated.
    """
    try:
        # Local import: a missing/broken indexer module is then reported as
        # an initialization failure instead of breaking module import.
        from indexer import DocumentIndexer

        logger.info("Starting ChromaDB initialization and validation...")
        indexer = DocumentIndexer()
        if VERBOSE:
            logger.debug("DocumentIndexer initialized successfully")
        return _validate_chroma_collection(indexer)
    except Exception as e:
        logger.error(f"Error initializing ChromaDB: {e}")
        return False


def _validate_chroma_collection(indexer) -> bool:
    """Return True if the collection has documents, indexing them first if it is empty."""
    try:
        count = indexer.collection.count()
        if count != 0:
            logger.info(
                f"ChromaDB collection '{indexer.collection_name}' contains {count} documents"
            )
            if VERBOSE:
                _log_chroma_sample(indexer, count, "existing")
            return True

        logger.warning(
            f"ChromaDB collection '{indexer.collection_name}' exists but contains no documents"
        )
        # Nothing to index from: fail early with a specific message.
        if not DOCUMENTS_PATH:
            logger.error("DOCUMENTS_PATH environment variable is not set")
            return False
        if not os.path.exists(DOCUMENTS_PATH):
            logger.error(f"Documents directory does not exist: {DOCUMENTS_PATH}")
            return False

        logger.info(f"Starting document indexing from {DOCUMENTS_PATH}")
        return _index_documents_into_chroma(indexer)
    except Exception as e:
        logger.error(f"Error checking ChromaDB collection: {e}")
        return False


def _index_documents_into_chroma(indexer) -> bool:
    """Index DOCUMENTS_PATH into the collection and return True if it now holds documents."""
    try:
        indexer.index_documents(DOCUMENTS_PATH)
        # Re-count to verify that indexing actually stored something.
        new_count = indexer.collection.count()
        if new_count > 0:
            logger.info(f"Successfully indexed {new_count} documents")
            if VERBOSE:
                _log_chroma_sample(indexer, new_count, "indexed")
            return True
        logger.error("No documents were indexed")
        return False
    except Exception as e:
        logger.error(f"Error during document indexing: {e}")
        return False


def _log_chroma_sample(indexer, doc_count: int, label: str) -> None:
    """Debug-log up to five sample documents from the collection; never raises."""
    try:
        results = indexer.collection.query(
            query_texts=["Provide a sample of the documentation"],
            # Never request more results than the collection holds.
            n_results=min(5, doc_count),
            include=['documents', 'metadatas'],
        )
        logger.debug(f"Sample of {label} documents:")
        documents = results['documents']
        if documents and documents[0]:
            logger.debug(f"- Number of documents retrieved: {len(documents[0])}")
        metadatas = results['metadatas']
        if metadatas and metadatas[0]:
            # A metadata entry may be None (presumably for documents stored
            # without metadata); report those as 'Unknown' instead of
            # aborting the whole sample log.
            sources = [(meta or {}).get('source', 'Unknown') for meta in metadatas[0]]
            logger.debug(f"- Document sources: {sources}")
    except Exception as e:
        # Sampling is purely diagnostic; it must not affect validation.
        logger.debug(f"Sample retrieval skipped: {e}")