def load()

in devai-cli/src/devai/commands/rag/load.py [0:0]


def load(repo, branch, db_path):
    """Build a Chroma collection of embeddings from a repository's files.

    The documents returned by ``load_docs`` are split into overlapping
    chunks, embedded with ``VertexAIEmbeddings`` and stored in the
    ``source_code_embeddings`` collection persisted at ``db_path``. Any
    existing directory at ``db_path`` is deleted first, so every call
    rebuilds the store from scratch.

    Args:
        repo: Repository reference, forwarded unchanged to ``load_docs``.
        branch: Branch name, forwarded unchanged to ``load_docs``.
        db_path: Directory passed to Chroma as ``persist_directory``.
    """
    # Working directory handed to load_docs (the checkout location,
    # judging by the original "cleanup clone" comment).
    local_dir = "./repo"

    # Common source code file extensions and markdown
    allowed_extensions = [
        ".py", ".java", ".cpp", ".c", ".cs", ".js", ".ts",
        ".php", ".rb", ".go", ".swift", ".rs", ".md"
    ]

    # Drop leftovers from a previous (possibly aborted) run.
    _remove_tree(local_dir)
    try:
        # 1. Load filtered documents
        documents = load_docs(repo, branch, local_dir, allowed_extensions)

        # 2. Split into smaller chunks
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000, chunk_overlap=200)
        texts = text_splitter.split_documents(documents)

        # 3. Generate Embeddings (Replace with your preferred embedding
        #    model if not using Vertex AI)
        EMBEDDING_QPM = 100
        EMBEDDING_NUM_BATCH = 5
        embeddings = VertexAIEmbeddings(
            requests_per_minute=EMBEDDING_QPM,
            num_instances_per_batch=EMBEDDING_NUM_BATCH,
            model_name="textembedding-gecko@latest",
        )

        # 4. Store in ChromaDB, replacing whatever a previous run persisted.
        _remove_tree(db_path)
        db = Chroma.from_documents(
            texts, embeddings,
            persist_directory=db_path,
            collection_name="source_code_embeddings"
        )
        db.persist()
    finally:
        # Remove the working directory even when a step above failed, so a
        # broken run does not leave a stale checkout on disk.
        _remove_tree(local_dir)
    print("Done with load")


def _remove_tree(path):
    """Best-effort recursive delete of ``path``; a missing path is fine."""
    # An empty or None path has nothing to delete; skipping it keeps the
    # original "ignore and continue" behaviour without a bare ``except``.
    if not path:
        return
    # ignore_errors suppresses filesystem errors only, so KeyboardInterrupt
    # and SystemExit are no longer swallowed as they were by ``except:``.
    shutil.rmtree(path, ignore_errors=True)