assets/large_language_models/rag/components/generate_embeddings_parallel/spec.yaml:

$schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json
type: parallel

tags:
  Preview: ""

version: 0.0.79
name: llm_rag_generate_embeddings_parallel
display_name: LLM - Generate Embeddings Parallel
is_deterministic: true

description: |
  Generates embeddings vectors for data chunks read from `chunks_source`.

  `chunks_source` is expected to contain `csv` files containing two columns:
  - "Chunk" - Chunk of text to be embedded
  - "Metadata" - JSON object containing metadata for the chunk

  If `embeddings_container` is supplied, input chunks are compared to existing chunks in the Embeddings Container
  and only changed/new chunks are embedded; existing chunks are reused.

resources:
  instance_count: -1

inputs:
  chunks_source:
    type: uri_folder
    description: "Folder containing chunks to be embedded."
  # If adding to previously generated Embeddings
  embeddings_container:
    type: uri_folder
    optional: true
    mode: direct
    description: "Folder containing previously generated embeddings. Should be the parent folder of the 'embeddings' output path used for this component. Will compare input data to existing embeddings and only embed changed/new data, reusing existing chunks."
  # Embeddings settings
  embeddings_model:
    type: string
    default: "hugging_face://model/sentence-transformers/all-mpnet-base-v2"
    description: "The model to use to embed data. E.g. 'hugging_face://model/sentence-transformers/all-mpnet-base-v2' or 'azure_open_ai://deployment/{deployment_name}/model/{model_name}'"
  deployment_validation:
    type: uri_file
    description: "URI file containing information on whether the Azure OpenAI deployments, if used, have been validated."
    optional: true

outputs:
  embeddings:
    type: uri_folder
    description: "Where to save data with embeddings. This should be a subfolder of previous embeddings if supplied, typically named using '${name}'. e.g. /my/prev/embeddings/${name}"
    mode: rw_mount
  processed_file_names:
    type: uri_file
    description: "Text file containing the names of the files that were processed."
    mode: rw_mount

mini_batch_size: "3"
mini_batch_error_threshold: 0
logging_level: "INFO"
input_data: ${{inputs.chunks_source}}

retry_settings:
  max_retries: 3
  timeout: 3600

task:
  type: run_function
  code: '../src'
  entry_script: embeddings/tasks/embed_prs.py
  environment: azureml:llm-rag-embeddings@latest
  program_arguments: >-
    --output_data ${{outputs.embeddings}}
    $[[--embeddings_container ${{inputs.embeddings_container}}]]
    --embeddings_model ${{inputs.embeddings_model}}
    --task_overhead_timeout 1200
    --progress_update_timeout 600
    --first_task_creation_timeout 600
  append_row_to: ${{outputs.processed_file_names}}
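
# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the component spec). It shows
# the chunk CSV layout described above and how this component might be
# referenced from a pipeline job. The compute target, datastore path, and
# job name below are assumed placeholders, not values defined by this spec.
#
# Expected chunks_source CSV layout:
#   Chunk,Metadata
#   "Some chunk of text to embed...","{""source"": ""doc1.md""}"
#
# Possible pipeline job reference (assumed names):
# jobs:
#   generate_embeddings:
#     component: azureml:llm_rag_generate_embeddings_parallel:0.0.79
#     compute: azureml:cpu-cluster                                   # assumed compute target
#     inputs:
#       chunks_source:
#         type: uri_folder
#         path: azureml://datastores/workspaceblobstore/paths/chunks/   # assumed path
#       embeddings_model: "hugging_face://model/sentence-transformers/all-mpnet-base-v2"
#     outputs:
#       embeddings:
#         type: uri_folder
#         mode: rw_mount
# ---------------------------------------------------------------------------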