in devai-cli/src/devai/commands/rag/load.py [0:0]
def load(repo, branch, db_path):
    """Index a repository's source files into a persistent Chroma collection.

    The pipeline is: load the filtered documents via ``load_docs``, split
    them into overlapping chunks, embed the chunks with Vertex AI, and write
    them to the ``source_code_embeddings`` collection persisted at
    ``db_path``.

    Args:
        repo: Repository location, forwarded unchanged to ``load_docs``
            (presumably a clone URL or path -- confirm against ``load_docs``).
        branch: Branch name, forwarded unchanged to ``load_docs``.
        db_path: Directory used as the Chroma ``persist_directory``. Any
            existing directory at this path is deleted before the new
            collection is written.

    Side effects:
        Uses ``./repo`` (relative to the current working directory) as a
        scratch checkout directory; it is removed before the run and again
        afterwards, even if the run fails. Prints ``"Done with load"`` on
        success.
    """
    local_dir = "./repo"

    # Common source code file extensions and markdown.
    allowed_extensions = [
        ".py", ".java", ".cpp", ".c", ".cs", ".js", ".ts",
        ".php", ".rb", ".go", ".swift", ".rs", ".md",
    ]

    # Rate limit and batch size handed to the Vertex AI embeddings client.
    EMBEDDING_QPM = 100
    EMBEDDING_NUM_BATCH = 5

    # Best-effort removal of a checkout left behind by an earlier run.
    # ignore_errors=True keeps the "missing directory is fine" behaviour
    # without a bare ``except`` that would also swallow KeyboardInterrupt.
    shutil.rmtree(local_dir, ignore_errors=True)

    try:
        # 1. Load filtered documents.
        documents = load_docs(repo, branch, local_dir, allowed_extensions)

        # 2. Split into smaller chunks.
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000, chunk_overlap=200)
        texts = text_splitter.split_documents(documents)

        # 3. Generate embeddings (replace with your preferred embedding
        #    model if not using Vertex AI).
        embeddings = VertexAIEmbeddings(
            requests_per_minute=EMBEDDING_QPM,
            num_instances_per_batch=EMBEDDING_NUM_BATCH,
            model_name="textembedding-gecko@latest",
        )

        # 4. Store in ChromaDB. The previous database is dropped only now,
        #    after loading and splitting succeeded, so a failed load does
        #    not destroy an existing index.
        shutil.rmtree(db_path, ignore_errors=True)
        db = Chroma.from_documents(
            texts, embeddings,
            persist_directory=db_path,
            collection_name="source_code_embeddings"
        )
        db.persist()
    finally:
        # Always remove the scratch checkout, including on failure, so an
        # aborted run does not leave ./repo behind.
        shutil.rmtree(local_dir, ignore_errors=True)

    print("Done with load")