public static IEnumerable<string> GetTextChunks()

in src/WebJobs.Extensions.OpenAI/Embeddings/EmbeddingsHelper.cs [80:162]


    /// <summary>
    /// Splits the text supplied by <paramref name="reader"/> into trimmed chunks of at most
    /// <paramref name="maxChunkSize"/> characters. Each chunk preferably ends just after a
    /// sentence-ending character, otherwise just after a word-break character, otherwise at
    /// the end of the data read so far.
    /// </summary>
    /// <param name="reader">Source of the text to split.</param>
    /// <param name="minChunkSize">
    /// Boundary characters located before this buffer index are not used to end a chunk.
    /// </param>
    /// <param name="maxChunkSize">Size of the working buffer and upper bound on chunk length.</param>
    /// <param name="overlap">
    /// Maximum number of characters from the end of one chunk that are repeated at the start
    /// of the next. The repeated region is shortened so that it starts on a word break.
    /// </param>
    /// <param name="sentenceEndings">
    /// Characters treated as sentence endings; <c>sentenceEndingsDefault</c> is used when null.
    /// </param>
    /// <param name="wordBreaks">
    /// Characters treated as word breaks; <c>wordBreaksDefault</c> is used when null.
    /// </param>
    /// <returns>A lazily evaluated sequence of trimmed text chunks.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="reader"/> is null.</exception>
    /// <exception cref="ArgumentException">
    /// <paramref name="minChunkSize"/> or <paramref name="overlap"/> is negative,
    /// <paramref name="maxChunkSize"/> is not positive, or <paramref name="minChunkSize"/> or
    /// <paramref name="overlap"/> exceeds <paramref name="maxChunkSize"/>.
    /// </exception>
    public static IEnumerable<string> GetTextChunks(
        TextReader reader,
        int minChunkSize,
        int maxChunkSize,
        int overlap,
        char[]? sentenceEndings = null,
        char[]? wordBreaks = null)
    {
        // Validate here rather than inside the iterator so that bad arguments fail at the
        // call site instead of on the first MoveNext().
        if (reader == null)
        {
            throw new ArgumentNullException(nameof(reader));
        }

        if (minChunkSize < 0 || maxChunkSize <= 0 || overlap < 0 || minChunkSize > maxChunkSize || overlap > maxChunkSize)
        {
            throw new ArgumentException("Invalid chunk size or overlap");
        }

        sentenceEndings ??= sentenceEndingsDefault;
        wordBreaks ??= wordBreaksDefault;

        HashSet<char> sentenceEndingsSet = new(sentenceEndings);
        HashSet<char> wordBreaksSet = new(wordBreaks);

        return EnumerateChunks();

        // Scans chars[lowerBound..upperBound) backwards and returns the index just past the
        // last terminator found, or -1 when the range contains none.
        int FindBoundary(char[] chars, HashSet<char> terminators, int lowerBound, int upperBound)
        {
            for (int i = upperBound - 1; i >= lowerBound; i--)
            {
                if (terminators.Contains(chars[i]))
                {
                    return i + 1;
                }
            }

            return -1;
        }

        IEnumerable<string> EnumerateChunks()
        {
            char[] buffer = new char[maxChunkSize];

            // Number of characters carried over at the front of the buffer (overlap + unread tail).
            int startIndex = 0;

            // Index within the carried-over region where not-yet-yielded characters begin.
            int pendingStart = 0;

            int charsRead;
            while ((charsRead = reader.Read(buffer, startIndex, maxChunkSize - startIndex)) > 0)
            {
                int endIndex = startIndex + charsRead;

                // Only freshly read characters at or beyond minChunkSize may end a chunk.
                int searchFloor = Math.Max(startIndex, minChunkSize);

                // Prefer a sentence ending, then a word break, then the whole buffer.
                int boundaryIndex = FindBoundary(buffer, sentenceEndingsSet, searchFloor, endIndex);
                if (boundaryIndex == -1)
                {
                    boundaryIndex = FindBoundary(buffer, wordBreaksSet, searchFloor, endIndex);
                }

                if (boundaryIndex == -1)
                {
                    boundaryIndex = endIndex;
                }

                yield return new string(buffer, 0, boundaryIndex).Trim();

                // Find overlap start without word truncation
                int overlapIndex = Math.Max(0, boundaryIndex - overlap);
                while (overlapIndex < boundaryIndex && !wordBreaksSet.Contains(buffer[overlapIndex]))
                {
                    overlapIndex++;
                }

                // If the carried region would fill the whole buffer, the next Read would be
                // asked for zero characters, return 0, and silently end the enumeration.
                // Shrink the overlap to the next word break so there is always room to read.
                if (endIndex - overlapIndex >= maxChunkSize)
                {
                    do
                    {
                        overlapIndex++;
                    }
                    while (overlapIndex < boundaryIndex && !wordBreaksSet.Contains(buffer[overlapIndex]));
                }

                // Shift the overlap plus any unread tail to the front of the buffer.
                int remainingChars = endIndex - overlapIndex;
                if (remainingChars > 0)
                {
                    Array.Copy(buffer, overlapIndex, buffer, 0, remainingChars);
                }

                startIndex = remainingChars;
                pendingStart = boundaryIndex - overlapIndex;
            }

            // The reader is exhausted. Text that followed the last boundary is still sitting in
            // the buffer and has never been yielded; emit it (with its leading overlap) unless
            // the unseen part is only whitespace.
            if (startIndex > pendingStart
                && !string.IsNullOrWhiteSpace(new string(buffer, pendingStart, startIndex - pendingStart)))
            {
                yield return new string(buffer, 0, startIndex).Trim();
            }
        }
    }