in agora/cerebral_api/src/indexer.py [0:0]
def index_documents(self, directory_path: Optional[str] = None) -> bool:
    """
    Index documents from a directory with optimized chunking.

    Args:
        directory_path (Optional[str]): Path to the directory containing documents.

    Returns:
        bool: True if indexing was successful, False otherwise.
    """
    try:
        directory_path = directory_path or DOCUMENTS_PATH
        if not directory_path or not os.path.exists(directory_path):
            raise ValueError(f"Invalid directory path: {directory_path}")

        logger.info(f"Starting document indexing from: {directory_path}")

        # Track progress
        processed_files = set()
        total_chunks = 0
        processed_count = 0

        # Process each file
        for root, _, files in os.walk(directory_path):
            for file in files:
                if file.endswith(('.pdf', '.txt', '.md')):
                    file_path = os.path.join(root, file)

                    # Skip if already processed
                    if file_path in processed_files:
                        continue

                    try:
                        # Process document into chunks
                        texts = self.process_document(file_path)

                        if texts:
                            # Prepare data for ChromaDB with minimal metadata.
                            # Note: "page" records the chunk index within the
                            # file, not a PDF page number.
                            documents = [text.page_content for text in texts]
                            metadatas = [{"source": file_path, "page": i} for i in range(len(texts))]

                            # Generate simple IDs
                            ids = [f"{os.path.basename(file_path)}_{i}" for i in range(len(texts))]

                            # Add to ChromaDB
                            self.collection.add(
                                documents=documents,
                                metadatas=metadatas,
                                ids=ids
                            )

                            # Update statistics
                            total_chunks += len(texts)
                            processed_count += 1
                            processed_files.add(file_path)

                            if VERBOSE:
                                logger.debug(f"Indexed {len(texts)} chunks from {file_path}")
                        else:
                            logger.warning(f"No content extracted from: {file_path}")

                    except Exception as e:
                        logger.error(f"Error processing {file_path}: {str(e)}")
                        continue

        logger.info(f"Indexing completed: {processed_count} files, {total_chunks} chunks")
        return total_chunks > 0

    except Exception as e:
        logger.error(f"Error during indexing: {str(e)}")
        return False
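
# Usage sketch (illustrative only): a minimal example of how this method might
# be called. The class name `DocumentIndexer` and its no-argument constructor
# are assumptions for demonstration; this excerpt only shows the method, which
# relies on `self.process_document` and a ChromaDB `self.collection` being set
# up elsewhere in the class.
#
#     indexer = DocumentIndexer()
#     if indexer.index_documents("/path/to/documents"):
#         logger.info("Documents indexed successfully")
#     else:
#         logger.error("Document indexing failed")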