internal record struct TextChunk()

in AzureCosmosDB/csharp/DocumentVectorPipelineFunctions/DocumentChunker.cs [7:116]


/// <summary>
/// A piece of text produced by splitting a larger document, paired with its
/// position in the sequence of pieces.
/// </summary>
/// <param name="Text">The text content of this chunk.</param>
/// <param name="ChunkNumber">
/// The ordinal assigned to this chunk; <c>DocumentChunker</c> numbers chunks starting at 0.
/// </param>
internal record struct TextChunk(string Text, int ChunkNumber);

/// <summary>
/// Splits document content into numbered <see cref="TextChunk"/> values by delegating
/// the actual splitting to <c>TextChunker</c>.
/// </summary>
internal static class DocumentChunker
{
    /// <summary>
    /// Default maximum number of tokens per chunk. Not referenced inside this class;
    /// presumably supplied by callers as the <c>maxTokensPerChunk</c> argument.
    /// </summary>
    public const int DefaultMaxTokensPerChunk = 250;

    /// <summary>
    /// Default number of overlapping tokens between chunks. Not referenced inside this class;
    /// presumably supplied by callers as the <c>overlapTokens</c> argument.
    /// </summary>
    public const int DefaultOverlapTokens = 0;

#pragma warning disable SKEXP0050 // Type is for evaluation purposes only and is subject to change or removal in future updates. Suppress this diagnostic to proceed.

    /// <summary>
    /// Chunks the text of a document analysis result using the plain-text splitter.
    /// </summary>
    /// <param name="result">The analysis result to chunk; may be <c>null</c>.</param>
    /// <param name="maxTokensPerChunk">Maximum token count passed to the splitter.</param>
    /// <param name="overlapTokens">Overlap token count passed to the splitter.</param>
    /// <returns>
    /// The chunks numbered from 0 in splitter order, or an empty sequence when
    /// <paramref name="result"/> is <c>null</c>.
    /// </returns>
    public static IEnumerable<TextChunk> FixedSizeChunking(
        AnalyzeResult? result,
        int maxTokensPerChunk,
        int overlapTokens)
    {
        if (result == null)
        {
            return [];
        }

        // Handle different types of output from Azure Document Intelligence.
        // This happens for different types of input. In particular, .docx files
        // don't seem to have lines populated.
        //
        // If it has a collection of pages with lines, use that.
        //
        // Otherwise if there are paragraphs, we'll use them as input.
        //
        // Third, we'll use the "words" collection of each page, building it up into
        // roughly line sized blocks to pass in.
        //
        // Finally, if there is nothing else, we'll fall back to the Content property.
        //
        // Note: only the first page is inspected when deciding whether lines/words exist.
        IEnumerable<string> lines;
        if (result.Pages?.Count > 0 && result.Pages?[0]?.Lines?.Count > 0)
        {
            lines = result.Pages.SelectMany(page => page.Lines.Select(line => line.Content));
        }
        else if (result.Paragraphs?.Count > 0)
        {
            lines = result.Paragraphs.Select(para => para.Content);
        }
        else if (result.Pages?.Count > 0 && result.Pages?[0]?.Words?.Count > 0)
        {
            lines = SplitWords(result);
        }
        else
        {
            lines = [result.Content];
        }

        return NumberChunks(
            TextChunker.SplitPlainTextParagraphs(lines, maxTokensPerChunk, overlapTokens));
    }

    /// <summary>
    /// Chunks plain-text lines using the plain-text splitter.
    /// </summary>
    /// <param name="lines">The plain-text lines to chunk.</param>
    /// <param name="maxTokensPerChunk">Maximum token count passed to the splitter.</param>
    /// <param name="overlapTokens">Overlap token count passed to the splitter.</param>
    /// <returns>The chunks numbered from 0 in splitter order.</returns>
    public static IEnumerable<TextChunk> ChunkTextLines(
        IEnumerable<string> lines,
        int maxTokensPerChunk,
        int overlapTokens)
    {
        // Plain text goes through the plain-text splitter (as in FixedSizeChunking);
        // the Markdown splitter is used only by ChunkMarkdownLines.
        return NumberChunks(
            TextChunker.SplitPlainTextParagraphs(lines, maxTokensPerChunk, overlapTokens));
    }

    /// <summary>
    /// Chunks Markdown lines using the Markdown splitter.
    /// </summary>
    /// <param name="lines">The Markdown lines to chunk.</param>
    /// <param name="maxTokensPerChunk">Maximum token count passed to the splitter.</param>
    /// <param name="overlapTokens">Overlap token count passed to the splitter.</param>
    /// <returns>The chunks numbered from 0 in splitter order.</returns>
    public static IEnumerable<TextChunk> ChunkMarkdownLines(
        IEnumerable<string> lines,
        int maxTokensPerChunk,
        int overlapTokens)
    {
        return NumberChunks(
            TextChunker.SplitMarkdownParagraphs(lines, maxTokensPerChunk, overlapTokens));
    }

#pragma warning restore SKEXP0050 // Type is for evaluation purposes only and is subject to change or removal in future updates. Suppress this diagnostic to proceed.

    /// <summary>
    /// Maximum number of words joined into one block by <see cref="SplitWords"/>.
    /// </summary>
    private const int MaxChunkWordCount = 40;

    /// <summary>
    /// Pairs each chunk text with its zero-based position.
    /// </summary>
    /// <remarks>
    /// Uses the indexed <c>Select</c> overload rather than a captured counter so that
    /// numbering restarts at 0 every time the returned sequence is enumerated.
    /// </remarks>
    private static IEnumerable<TextChunk> NumberChunks(IEnumerable<string> chunks)
    {
        return chunks.Select((text, index) => new TextChunk(text, index));
    }

    /// <summary>
    /// Joins the words of every page into space-separated blocks of at most
    /// <see cref="MaxChunkWordCount"/> words. Blocks may span page boundaries;
    /// the final block may be shorter.
    /// </summary>
    /// <param name="result">The analysis result whose page words are read.</param>
    /// <returns>A lazily produced sequence of word blocks.</returns>
    private static IEnumerable<string> SplitWords(AnalyzeResult result)
    {
        var words = new List<string>(MaxChunkWordCount);
        foreach (var page in result.Pages)
        {
            foreach (var word in page.Words)
            {
                words.Add(word.Content);

                // Flush as soon as the block is full so it never exceeds the maximum.
                if (words.Count >= MaxChunkWordCount)
                {
                    yield return string.Join(" ", words);
                    words.Clear();
                }
            }
        }

        // Emit whatever is left over after the last full block.
        if (words.Count > 0)
        {
            yield return string.Join(" ", words);
        }
    }
}