assets/large_language_models/rag/components/generate_embeddings_parallel/spec.yaml (62 lines of code) (raw):
# Identity and metadata for the parallel embeddings component.
# The schema URL is kept in step with `type` below so that schema-aware
# tooling checks this file against the parallel-component schema rather than
# the command-component one.
$schema: https://azuremlschemas.azureedge.net/latest/parallelComponent.schema.json
type: parallel
tags:
  # Empty-string value: the tag acts as a marker.
  Preview: ""
version: 0.0.79
name: llm_rag_generate_embeddings_parallel
display_name: LLM - Generate Embeddings Parallel
is_deterministic: true
# Human-readable summary of the component. Written as a literal block scalar
# (`|`) so the line breaks and the two-item column list are kept as written.
# Input names mentioned here must match the keys declared under `inputs`.
description: |
  Generates embeddings vectors for data chunks read from `chunks_source`.
  `chunks_source` is expected to contain `csv` files containing two columns:
  - "Chunk" - Chunk of text to be embedded
  - "Metadata" - JSON object containing metadata for the chunk
  If `embeddings_container` is supplied, input chunks are compared to existing chunks in the Embeddings Container and only changed/new chunks are embedded, existing chunks being reused.
# Compute resources for the parallel run.
resources:
  # NOTE(review): -1 is not a usable node count; presumably a placeholder that
  # the consuming pipeline is expected to override — confirm.
  instance_count: -1
# Inputs accepted by the component.
inputs:
  # Required: this is the input bound to `input_data` for the parallel run.
  chunks_source:
    type: uri_folder
    description: "Folder containing chunks to be embedded."
  # If adding to previously generated Embeddings
  embeddings_container:
    type: uri_folder
    optional: true
    mode: direct
    description: "Folder containing previously generated embeddings. Should be parent folder of the 'embeddings' output path used for this component. Will compare input data to existing embeddings and only embed changed/new data, reusing existing chunks."
  # Embeddings settings
  embeddings_model:
    type: string
    default: "hugging_face://model/sentence-transformers/all-mpnet-base-v2"
    description: "The model to use to embed data. E.g. 'hugging_face://model/sentence-transformers/all-mpnet-base-v2' or 'azure_open_ai://deployment/{deployment_name}/model/{model_name}'"
  # NOTE(review): this input is not referenced in `task.program_arguments`;
  # presumably it exists only so a pipeline can order this step after a
  # deployment-validation step — confirm.
  deployment_validation:
    type: uri_file
    description: "Uri file containing information on if the Azure OpenAI deployments, if used, have been validated"
    optional: true
# Outputs produced by the component; both are mounted read-write.
outputs:
  # Passed to the entry script as `--output_data` (see `task.program_arguments`).
  embeddings:
    type: uri_folder
    mode: rw_mount
    description: "Where to save data with embeddings. This should be a subfolder of previous embeddings if supplied, typically named using '${name}'. e.g. /my/prev/embeddings/${name}"
  # Target of `task.append_row_to`.
  processed_file_names:
    type: uri_file
    mode: rw_mount
    description: 'Text file containing the names of the files that were processed'
# Mini-batch, logging and retry configuration for the parallel run.
# NOTE(review): the units and exact semantics of these values come from the
# Azure ML parallel job schema, not from this file — confirm before tuning.

# Deliberately a string, not an integer.
mini_batch_size: '3'
mini_batch_error_threshold: 0
logging_level: INFO
# Binds the parallel run's input data to the `chunks_source` input.
input_data: '${{inputs.chunks_source}}'
retry_settings:
  max_retries: 3
  # Presumably seconds — confirm.
  timeout: 3600
# Definition of the work the parallel run executes.
task:
  type: run_function
  # Source directory and the script inside it that is run.
  # Presumably `entry_script` is resolved relative to `code` — confirm.
  code: ../src
  entry_script: embeddings/tasks/embed_prs.py
  environment: azureml:llm-rag-embeddings@latest
  # Folded block scalar (`>-`): the lines below are joined with single spaces
  # into one argument string with no trailing newline.
  # `$[[ ... ]]` wraps the argument for the optional `embeddings_container`
  # input; per Azure ML component syntax it is expected to be omitted when the
  # input is not supplied — confirm.
  program_arguments: >-
    --output_data ${{outputs.embeddings}}
    $[[--embeddings_container ${{inputs.embeddings_container}}]]
    --embeddings_model ${{inputs.embeddings_model}}
    --task_overhead_timeout 1200
    --progress_update_timeout 600
    --first_task_creation_timeout 600
  # Rows appended by the run are directed to the `processed_file_names` output.
  append_row_to: ${{outputs.processed_file_names}}