# assets/large_language_models/rag/components/generate_embeddings/spec.yaml (54 lines of code) (raw):
# Azure ML command component spec (validated against the schema below).
# The component generates embeddings for chunked text data.
$schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json
type: command

tags:
  Preview: ""

version: 0.0.73
name: llm_rag_generate_embeddings
display_name: LLM - Generate Embeddings
is_deterministic: true

# Literal block scalar (`|`): line breaks are preserved so the bullet list
# below keeps one item per line.
description: |
  Generates embeddings vectors for data chunks read from `chunks_source`.
  `chunks_source` is expected to contain `csv` files containing two columns:
  - "Chunk" - Chunk of text to be embedded
  - "Metadata" - JSON object containing metadata for the chunk
  If `embeddings_container` is supplied, input chunks are compared to existing chunks in the Embeddings Container and only changed/new chunks are embedded, existing chunks being reused.
inputs:
  # Required: the chunked data to embed.
  chunks_source:
    type: uri_folder
    description: "Folder containing chunks to be embedded."

  # If adding to previously generated Embeddings
  embeddings_container:
    type: uri_folder
    optional: true
    mode: direct
    description: "Folder containing previously generated embeddings. Should be parent folder of the 'embeddings' output path used for this component. Will compare input data to existing embeddings and only embed changed/new data, reusing existing chunks."

  # Embeddings settings
  embeddings_model:
    type: string
    optional: true
    description: "The model to use to embed data. E.g. 'hugging_face://model/sentence-transformers/all-mpnet-base-v2' or 'azure_open_ai://deployment/{deployment_name}/model/{model_name}'"
  batch_size:
    type: integer
    default: 100
    description: "Batch size to use when embedding data"
  num_workers:
    type: integer
    default: -1
    description: "Number of workers to use when embedding data. -1 means use half all available CPUs"

  # NOTE(review): this input is declared but never referenced by `command`;
  # presumably it only orders this step after a validation step -- confirm.
  deployment_validation:
    type: uri_file
    description: "Uri file containing information on if the Azure OpenAI deployments, if used, have been validated"
    optional: true
outputs:
  # Folder that receives the generated embeddings; mounted read-write.
  embeddings:
    type: uri_folder
    description: "Where to save data with embeddings. This should be a subfolder of previous embeddings if supplied, typically named using '${name}'. e.g. /my/prev/embeddings/${name}"
    mode: rw_mount
environment: azureml:llm-rag-embeddings@latest
code: '../src'

# Folded block scalar (`>-`): the lines below are joined with single spaces
# into one command line, with no trailing newline. Do not put comments inside
# the scalar -- they would become part of the command.
# Arguments wrapped in `$[[...]]` belong to optional inputs and are left out
# of the command when that input is not supplied.
command: >-
  python -m azureml.rag.tasks.embed
  --chunks_source '${{inputs.chunks_source}}'
  $[[--embeddings_model ${{inputs.embeddings_model}}]]
  $[[--embeddings_container '${{inputs.embeddings_container}}']]
  --output '${{outputs.embeddings}}'
  --batch_size ${{inputs.batch_size}}
  --num_workers ${{inputs.num_workers}}