def initialize_and_validate_chroma()

in agora/cerebral_api/src/app.py [0:0]


def _log_collection_sample(collection, count, label):
    """
    Best-effort debug log of a few documents from a ChromaDB collection.

    Runs a fixed sample query against ``collection`` asking for at most five
    results (fewer when ``count`` is smaller) and logs how many documents came
    back together with their ``source`` metadata. ``label`` is interpolated
    into the log header ("indexed" / "existing"). Any failure is logged at
    debug level and swallowed: this is diagnostic output only and must never
    affect the validation result.
    """
    try:
        results = collection.query(
            query_texts=["Provide a sample of the documentation"],
            n_results=min(5, count),
            include=['documents', 'metadatas'],
        )
        logger.debug(f"Sample of {label} documents:")

        documents = results['documents']
        if not (documents and documents[0]):
            return
        logger.debug(f"- Number of documents retrieved: {len(documents[0])}")

        metadatas = results['metadatas']
        if metadatas and metadatas[0]:
            # A metadata entry may be None (presumably for documents stored
            # without metadata - TODO confirm); treat it as empty so a single
            # such entry does not discard the whole source listing.
            sources = [(meta or {}).get('source', 'Unknown') for meta in metadatas[0]]
            logger.debug(f"- Document sources: {sources}")
    except Exception as e:
        logger.debug(f"Sample retrieval skipped: {e}")


def initialize_and_validate_chroma() -> bool:
    """
    Initialize ChromaDB, create index if needed, and validate document count.

    Steps:
      1. Import and construct ``DocumentIndexer``.
      2. Read the collection's document count. A non-empty collection is
         considered valid immediately.
      3. For an empty collection, index the documents found under
         ``DOCUMENTS_PATH`` and re-check the count.

    When ``VERBOSE`` is set, a small sample of the collection is written to
    the debug log after a successful check.

    Returns:
        True if the collection contains documents (already present or freshly
        indexed). False if ``DOCUMENTS_PATH`` is unset or missing on disk, if
        indexing produced no documents, or if any step raised; exceptions are
        logged with their traceback and never propagated to the caller.
    """
    # Phase 1: bring up the indexer. The import is local so that a missing or
    # broken indexer module is reported as an initialization failure here.
    try:
        from indexer import DocumentIndexer

        logger.info("Starting ChromaDB initialization and validation...")
        indexer = DocumentIndexer()
        if VERBOSE:
            logger.debug("DocumentIndexer initialized successfully")
    except Exception as e:
        logger.exception(f"Error initializing ChromaDB: {e}")
        return False

    # Phase 2: inspect the existing collection and, when it is empty, make
    # sure there is a documents directory to index from.
    try:
        count = indexer.collection.count()

        if count != 0:
            logger.info(f"ChromaDB collection '{indexer.collection_name}' contains {count} documents")
            if VERBOSE:
                _log_collection_sample(indexer.collection, count, "existing")
            return True

        logger.warning(f"ChromaDB collection '{indexer.collection_name}' exists but contains no documents")

        if not DOCUMENTS_PATH:
            logger.error("DOCUMENTS_PATH environment variable is not set")
            return False

        if not os.path.exists(DOCUMENTS_PATH):
            logger.error(f"Documents directory does not exist: {DOCUMENTS_PATH}")
            return False

        logger.info(f"Starting document indexing from {DOCUMENTS_PATH}")
    except Exception as e:
        logger.exception(f"Error checking ChromaDB collection: {e}")
        return False

    # Phase 3: index the documents and verify that something was stored.
    try:
        indexer.index_documents(DOCUMENTS_PATH)

        new_count = indexer.collection.count()
        if new_count > 0:
            logger.info(f"Successfully indexed {new_count} documents")
            if VERBOSE:
                _log_collection_sample(indexer.collection, new_count, "indexed")
            return True

        logger.error("No documents were indexed")
        return False
    except Exception as e:
        logger.exception(f"Error during document indexing: {e}")
        return False