def index_documents()

in agora/cerebral_api/src/indexer.py [0:0]


    def index_documents(self, directory_path: Optional[str] = None) -> bool:
        """
        Index documents from a directory with optimized chunking.
        
        Args:
            directory_path (Optional[str]): Path to directory containing documents
            
        Returns:
            bool: True if at least one chunk was indexed, False otherwise
        """
        try:
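            # Fall back to the module-level DOCUMENTS_PATH when no directory is given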
            directory_path = directory_path or DOCUMENTS_PATH
            if not directory_path or not os.path.exists(directory_path):
                raise ValueError(f"Invalid directory path: {directory_path}")

            logger.info(f"Starting document indexing from: {directory_path}")
            
            # Track progress
            processed_files = set()
            total_chunks = 0
            processed_count = 0

            # Process each file
            for root, _, files in os.walk(directory_path):
                for file in files:
                    if file.endswith(('.pdf', '.txt', '.md')):
                        file_path = os.path.join(root, file)
                        
                        # Skip if already processed
                        if file_path in processed_files:
                            continue
                        
                        try:
                            # Process document
                            texts = self.process_document(file_path)
                            
                            if texts:
                                # Prepare data for ChromaDB with minimal metadata;
                                # "page" stores the chunk's index within the file's extracted text
                                documents = [text.page_content for text in texts]
                                metadatas = [{"source": file_path, "page": i} for i in range(len(texts))]
                                
                                # Generate simple IDs from the file basename and chunk index
                                # (duplicate basenames in different subdirectories would collide)
                                ids = [f"{os.path.basename(file_path)}_{i}" for i in range(len(texts))]
                                
                                # Add to ChromaDB
                                self.collection.add(
                                    documents=documents,
                                    metadatas=metadatas,
                                    ids=ids
                                )
                                
                                # Update statistics
                                total_chunks += len(texts)
                                processed_count += 1
                                processed_files.add(file_path)
                                
                                if VERBOSE:
                                    logger.debug(f"Indexed {len(texts)} chunks from {file_path}")
                            else:
                                logger.warning(f"No content extracted from: {file_path}")
                                
                        except Exception as e:
                            logger.error(f"Error processing {file_path}: {str(e)}")
                            continue

            logger.info(f"Indexing completed: {processed_count} files, {total_chunks} chunks")
            return total_chunks > 0
            
        except Exception as e:
            logger.error(f"Error during indexing: {str(e)}")
            return False
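
Usage sketch (illustrative only): the wrapper class name `DocumentIndexer` and its constructor are assumptions for this example and do not appear in the excerpt above; only the `index_documents` method itself is documented here.

    # Hypothetical caller -- DocumentIndexer is assumed to be the class that owns
    # self.collection and process_document(); only index_documents() is shown above.
    indexer = DocumentIndexer()
    ok = indexer.index_documents("/data/docs")  # omit the argument to fall back to DOCUMENTS_PATH
    if ok:
        print("Indexed at least one chunk")
    else:
        print("Indexing failed or produced no chunks")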