internal record struct TextChunk()

in AzureSQL/csharp/DocumentVectorPipelineFunctions/TextChunker.cs [6:50]


/// <summary>
/// A piece of document text together with its position metadata.
/// </summary>
/// <param name="Text">The text content of the chunk.</param>
/// <param name="PageNumberIfKnown">
/// Page position associated with the chunk. <c>TextChunker.FixedSizeChunking</c> fills this
/// from a page counter that starts at 0.
/// NOTE(review): the name suggests the page may be unknown for some producers, but no
/// "unknown" sentinel value is defined here - confirm with callers.
/// </param>
/// <param name="ChunkNumber">
/// Sequence number of the chunk. <c>TextChunker.FixedSizeChunking</c> assigns these
/// consecutively starting at 0.
/// </param>
internal record struct TextChunk(
    string Text,
    int PageNumberIfKnown,
    int ChunkNumber);

/// <summary>
/// Splits the words of an analyzed document into space-joined text chunks.
/// </summary>
internal class TextChunker
{
    // Default character threshold at which a chunk is emitted.
    private const int MaxChunkSize = 2048;

    /// <summary>
    /// Lazily walks every word of every page in <paramref name="result"/>, joining the words
    /// with single spaces and emitting a <see cref="TextChunk"/> each time the accumulated
    /// text grows past <paramref name="chunkSize"/> characters.
    /// </summary>
    /// <remarks>
    /// <para>
    /// <paramref name="chunkSize"/> is a threshold, not a hard cap: the length check runs
    /// after a word has been appended, so a chunk can be longer than
    /// <paramref name="chunkSize"/> by up to the length of its last word.
    /// </para>
    /// <para>
    /// The buffer is not reset at page boundaries, so one chunk may contain words from
    /// several pages. Each chunk is tagged with the zero-based index of the page that holds
    /// its last word.
    /// </para>
    /// <para>
    /// This is an iterator method: nothing is evaluated until the returned sequence is
    /// enumerated.
    /// </para>
    /// </remarks>
    /// <param name="result">The analysis result to chunk; <c>null</c> produces an empty sequence.</param>
    /// <param name="chunkSize">Character threshold that triggers emission of a chunk.</param>
    /// <returns>The chunks in document order, numbered consecutively from 0.</returns>
    public static IEnumerable<TextChunk> FixedSizeChunking(AnalyzeResult? result, int chunkSize = MaxChunkSize)
    {
        if (result is null)
        {
            yield break;
        }

        var buffer = new StringBuilder(chunkSize);
        var pageIndex = 0;
        var chunkIndex = 0;

        // Index of the page that contributed the most recently appended word. Needed for the
        // trailing chunk: by the time it is emitted, pageIndex has already been advanced past
        // the final page and would point at a page that does not exist.
        var lastWordPageIndex = 0;

        foreach (var page in result.Pages)
        {
            foreach (var word in page.Words)
            {
                buffer.Append(word.Content).Append(' ');
                lastWordPageIndex = pageIndex;

                if (buffer.Length > chunkSize)
                {
                    // Drop the separator that was appended after the last word.
                    buffer.Length -= 1;
                    string chunk = buffer.ToString();
                    buffer.Clear();

                    yield return new TextChunk(chunk, pageIndex, chunkIndex);
                    chunkIndex++;
                }
            }

            pageIndex++;
        }

        // Flush whatever is left. A buffer holding only a lone separator is skipped.
        if (buffer.Length > 1)
        {
            buffer.Length -= 1;
            yield return new TextChunk(buffer.ToString(), lastWordPageIndex, chunkIndex);
        }
    }
}