assets/large_language_models/rag/components/qa_data_generation/spec.yaml (65 lines of code) (raw):

# Azure ML command component: generates a question/answer test dataset by
# sending chunks of the input documents to an LLM.
$schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json
type: command

tags:
  Preview: ""

version: 0.0.78
name: llm_rag_qa_data_generation
display_name: LLM - Generate QnA Test Data
is_deterministic: true

description: |
  Generates a test dataset of questions and answers based on the input documents.

  A chunk of text is read from each input document and sent to the specified LLM with a prompt to create a question and answer based on that text.
  These question, answer, and context sets are saved as either a csv or jsonl file.
  Short-answer, long-answer, summary, and boolean-based QAs are generated.

inputs:
  # OpenAI client settings, forwarded unchanged to the task as CLI flags.
  openai_api_version:
    type: string
    default: "2023-03-15-preview"
    description: "Version of OpenAI API to use for communicating with LLM."
  openai_api_type:
    type: string
    default: "azure"
    description: "Type of OpenAI endpoint hosting model. Defaults to azure for AOAI endpoints."

  # Source documents (already chunked, per the description).
  input_data:
    type: uri_folder
    description: "Uri folder of documents containing chunks of data."

  # Model selection. The default is a JSON document kept in single quotes so
  # that its inner double quotes need no escaping.
  # NOTE(review): the description says 'model_name' is the model "to use for
  # summary"; this component generates questions, so the wording may have been
  # copied from another component - confirm before changing the text.
  llm_config:
    type: string
    default: '{"type": "azure_open_ai", "model_name": "gpt-35-turbo", "deployment_name": "gpt-35-turbo", "temperature": 0, "max_tokens": 2000}'
    description: "JSON Configuration for what model to use for question generation. Must contain following keys: 'type' (value must be 'azure_open_ai' or 'azure'), 'model_name' (name of model to use for summary), 'deployment_name' (name of deployment for model), 'temperature' (randomness in response, float from 0 to 1), 'max_tokens' (number of tokens for response)."
  llm_connection:
    type: string
    optional: false
    description: "Workspace connection resource ID for the completion model."

  # Generation sizing.
  dataset_size:
    type: integer
    default: 100
    description: "Number of questions to generate"
  chunk_batch_size:
    type: integer
    default: 5
    description: "Number of chunks to be read and sent to LLM in parallel"

  # NOTE(review): the accepted values are listed as 'csv' and 'json', while the
  # component and output descriptions say "jsonl" - confirm which is accurate.
  output_format:
    type: string
    default: 'json'
    description: "File type to save the dataset as. Options are 'csv' and 'json'"

  # Declared but not referenced in `command` below; presumably it only exists
  # so a pipeline can order this step after deployment validation - TODO confirm.
  deployment_validation:
    type: uri_file
    description: "Uri file containing information on if the Azure OpenAI deployments, if used, have been validated"
    optional: true

outputs:
  output_data:
    type: uri_folder
    description: "csv or jsonl file containing the question, answer, context, and metadata sets"

environment: azureml:llm-rag-embeddings@latest

# nothing used from here: the command runs a module by name
# (python -m azureml.rag.tasks.generate_qa), presumably supplied by the
# environment rather than by this code directory.
code: '../src'

# Folded scalar (>-): the lines below are joined with single spaces into one
# command line, with no trailing newline. Every substituted value is
# single-quoted so that spaces or JSON double quotes survive shell word
# splitting.
command: >-
  python -m azureml.rag.tasks.generate_qa
  --input-data '${{inputs.input_data}}'
  --output-data '${{outputs.output_data}}'
  --dataset_size ${{inputs.dataset_size}}
  --chunk_batch_size ${{inputs.chunk_batch_size}}
  --llm_config '${{inputs.llm_config}}'
  --llm_connection '${{inputs.llm_connection}}'
  --output_format '${{inputs.output_format}}'
  --openai_api_version '${{inputs.openai_api_version}}'
  --openai_api_type '${{inputs.openai_api_type}}'