# assets/large_language_models/rag/components/crack_and_chunk/spec.yaml
$schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json
type: command
tags:
Preview: ""
version: 0.0.80
name: llm_rag_crack_and_chunk
display_name: LLM - Crack and Chunk Data
is_deterministic: true
description: |
  Creates chunks no larger than `chunk_size` from `input_data`; extracted document titles are prepended to each chunk.
  LLMs have token limits for the prompts passed to them. This is a limiting factor at embedding time, and even more so at prompt completion time, as only so much context can be passed along with instructions to the LLM and user queries.
  Chunking splits source data of various formats into small but coherent snippets of information, which can be 'packed' into LLM prompts when asking for answers to user queries related to the source documents.
  Supported formats: md, txt, html/htm, pdf, ppt(x), doc(x), xls(x), py
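# A minimal sketch of how this component might be referenced from a pipeline
# job (the registry path follows the usual AzureML registry convention; the
# input asset name 'my_docs' and the parameter values are placeholders):
#
#   jobs:
#     crack_and_chunk:
#       type: command
#       component: azureml://registries/azureml/components/llm_rag_crack_and_chunk/versions/0.0.80
#       inputs:
#         input_data:
#           type: uri_folder
#           path: azureml:my_docs:1
#         chunk_size: 512
#         chunk_overlap: 64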
inputs:
# Input AzureML Data
input_data:
type: uri_folder
mode: rw_mount
description: "Uri Folder containing files to be chunked."
# Files to handle from source
input_glob:
type: string
optional: true
description: "Limit files opened from `input_data`, defaults to '**/*'."
allowed_extensions:
type: string
optional: true
description: "Comma separated list of extensions to include, if not provided the default list of supported extensions will be used. e.g. '.md,.txt,.html,.py,.pdf.'"
# Chunking options
chunk_size:
type: integer
default: 768
description: "Maximum number of tokens to put in each chunk."
chunk_overlap:
type: integer
default: 0
description: "Number of tokens to overlap between chunks."
doc_intel_connection_id:
type: string
optional: true
description: "Connection id for Document Intelligence service. If provided, will be used to extract content from .pdf document."
data_source_url:
type: string
optional: true
description: "Base URL to join with file paths to create full source file URL for chunk metadata."
document_path_replacement_regex:
type: string
optional: true
description: "A JSON string with two fields, 'match_pattern' and 'replacement_pattern' to be used with re.sub on the source url. e.g. '{\"match_pattern\": \"(.*)/articles/(.*)(\\\\.[^.]+)$\", \"replacement_pattern\": \"\\\\1/\\\\2\"}' would remove '/articles' from the middle of the url."
max_sample_files:
type: integer
default: -1
description: "Number of files to chunk. Specify -1 to chunk all documents in input path."
use_rcts:
type: string
default: "True"
enum:
- "True"
- "False"
description: "Whether to use RecursiveCharacterTextSplitter to split documents into chunks"
output_format:
type: string
default: "jsonl"
enum:
- "csv"
- "jsonl"
description: "Format of the output chunk file"
outputs:
output_chunks:
type: uri_folder
description: "Uri Folder containing chunks. Each chunk will be a separate file in the folder"
environment: azureml:llm-rag-embeddings@latest
code: '../src'
command: >-
python -m azureml.rag.tasks.crack_and_chunk
--input_data '${{inputs.input_data}}'
$[[--input_glob '${{inputs.input_glob}}']]
$[[--allowed_extensions ${{inputs.allowed_extensions}}]]
--output_chunks ${{outputs.output_chunks}}
--chunk_size ${{inputs.chunk_size}}
--chunk_overlap ${{inputs.chunk_overlap}}
$[[--doc_intel_connection_id ${{inputs.doc_intel_connection_id}}]]
$[[--data_source_url ${{inputs.data_source_url}}]]
$[[--document_path_replacement_regex '${{inputs.document_path_replacement_regex}}']]
--max_sample_files ${{inputs.max_sample_files}}
--use_rcts '${{inputs.use_rcts}}'
--output_format ${{inputs.output_format}}
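# Example of the command as resolved at runtime with all optional inputs
# omitted ($[[...]] blocks drop out entirely; mount paths are placeholders
# assigned by AzureML):
#
#   python -m azureml.rag.tasks.crack_and_chunk \
#     --input_data '/mnt/azureml/.../input_data' \
#     --output_chunks /mnt/azureml/.../output_chunks \
#     --chunk_size 768 --chunk_overlap 0 \
#     --max_sample_files -1 --use_rcts 'True' --output_format jsonl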