# assets/large_language_models/rag/components/qa_data_generation/spec.yaml
---
# Azure ML command component: generates a QnA test dataset from chunked input
# documents by prompting the configured LLM once per chunk batch.
$schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json
type: command

tags:
  Preview: ""

version: 0.0.78
name: llm_rag_qa_data_generation
display_name: LLM - Generate QnA Test Data
is_deterministic: true

description: |
  Generates a test dataset of questions and answers based on the input documents.

  A chunk of text is read from each input document and sent to the specified LLM with a prompt to create a question and answer based on that text. These question, answer, and context sets are saved as either a csv or jsonl file.
  Short-answer, long-answer, summary, and boolean-based QAs are generated.

inputs:
  openai_api_version:
    type: string
    default: "2023-03-15-preview"
    description: "Version of OpenAI API to use for communicating with LLM."
  openai_api_type:
    type: string
    default: "azure"
    description: "Type of OpenAI endpoint hosting model. Defaults to azure for AOAI endpoints."
  input_data:
    type: uri_folder
    description: "Uri folder of documents containing chunks of data."
  llm_config:
    type: string
    default: '{"type": "azure_open_ai", "model_name": "gpt-35-turbo", "deployment_name": "gpt-35-turbo", "temperature": 0, "max_tokens": 2000}'
    description: "JSON Configuration for what model to use for question generation. Must contain following keys: 'type' (value must be 'azure_open_ai' or 'azure'), 'model_name' (name of model to use for summary), 'deployment_name' (name of deployment for model), 'temperature' (randomness in response, float from 0 to 1), 'max_tokens' (number of tokens for response)."
  llm_connection:
    type: string
    optional: false
    description: "Workspace connection resource ID for the completion model."
  dataset_size:
    type: integer
    default: 100
    description: "Number of questions to generate"
  chunk_batch_size:
    type: integer
    default: 5
    description: "Number of chunks to be read and sent to LLM in parallel"
  output_format:
    type: string
    default: 'json'
    description: "File type to save the dataset as. Options are 'csv' and 'json'"
  deployment_validation:
    type: uri_file
    description: "Uri file containing information on if the Azure OpenAI deployments, if used, have been validated"
    optional: true

outputs:
  output_data:
    type: uri_folder
    description: "csv or jsonl file containing the question, answer, context, and metadata sets"

environment: azureml:llm-rag-embeddings@latest
code: '../src'  # nothing used from here
command: >-
  python -m azureml.rag.tasks.generate_qa
  --input-data '${{inputs.input_data}}'
  --output-data ${{outputs.output_data}}
  --dataset_size ${{inputs.dataset_size}}
  --chunk_batch_size ${{inputs.chunk_batch_size}}
  --llm_config '${{inputs.llm_config}}'
  --llm_connection '${{inputs.llm_connection}}'
  --output_format '${{inputs.output_format}}'
  --openai_api_version '${{inputs.openai_api_version}}'
  --openai_api_type '${{inputs.openai_api_type}}'