in src/WebJobs.Extensions.OpenAI/Embeddings/EmbeddingsHelper.cs [80:162]
/// <summary>
/// Splits the text supplied by <paramref name="reader"/> into chunks of at most
/// <paramref name="maxChunkSize"/> characters. Each chunk preferably ends right after a
/// sentence-ending character, otherwise right after a word-break character, and otherwise
/// wherever the buffered text ends. Consecutive chunks share up to <paramref name="overlap"/>
/// characters of text.
/// </summary>
/// <param name="reader">The source of the text to split.</param>
/// <param name="minChunkSize">
/// Boundary characters located before this index in the buffered text are not used to end a chunk.
/// </param>
/// <param name="maxChunkSize">The maximum number of characters in a chunk (before trimming).</param>
/// <param name="overlap">
/// The maximum number of characters from the end of one chunk that are repeated at the start of the next.
/// </param>
/// <param name="sentenceEndings">
/// Characters treated as sentence terminators; <c>sentenceEndingsDefault</c> is used when <c>null</c>.
/// </param>
/// <param name="wordBreaks">
/// Characters treated as word breaks; <c>wordBreaksDefault</c> is used when <c>null</c>.
/// </param>
/// <returns>A lazily evaluated sequence of whitespace-trimmed text chunks.</returns>
/// <exception cref="ArgumentNullException"><paramref name="reader"/> is <c>null</c>.</exception>
/// <exception cref="ArgumentException">
/// A size or overlap argument is negative, <paramref name="maxChunkSize"/> is not positive, or
/// <paramref name="minChunkSize"/> or <paramref name="overlap"/> exceeds <paramref name="maxChunkSize"/>.
/// </exception>
/// <remarks>
/// Arguments are validated when the method is called; the reader is only consumed while the
/// returned sequence is enumerated.
/// </remarks>
public static IEnumerable<string> GetTextChunks(
    TextReader reader,
    int minChunkSize,
    int maxChunkSize,
    int overlap,
    char[]? sentenceEndings = null,
    char[]? wordBreaks = null)
{
    if (reader is null)
    {
        throw new ArgumentNullException(nameof(reader));
    }

    if (minChunkSize < 0 || maxChunkSize <= 0 || overlap < 0 || minChunkSize > maxChunkSize || overlap > maxChunkSize)
    {
        throw new ArgumentException("Invalid chunk size or overlap");
    }

    HashSet<char> sentenceEndingsSet = new(sentenceEndings ?? sentenceEndingsDefault);
    HashSet<char> wordBreaksSet = new(wordBreaks ?? wordBreaksDefault);

    // The iterator lives in a local function so that the guards above run at call time
    // instead of being deferred until the first MoveNext().
    return EnumerateChunks();

    IEnumerable<string> EnumerateChunks()
    {
        char[] buffer = new char[maxChunkSize];

        // Number of characters at the front of the buffer kept from the previous pass
        // (overlap followed by text that has not been part of any chunk yet).
        int carried = 0;

        // Size of the trailing part of the carried region that has not been yielded yet.
        int unconsumed = 0;

        // ReadBlock keeps reading until the buffer is full or the reader is exhausted, so chunk
        // sizes do not depend on how much a single Read call happens to return.
        int charsRead;
        while ((charsRead = reader.ReadBlock(buffer, carried, maxChunkSize - carried)) > 0)
        {
            int endIndex = carried + charsRead;

            // Only newly read text at or beyond minChunkSize may terminate the chunk.
            int searchFloor = Math.Max(carried, minChunkSize);

            // Prefer a sentence terminator, then a word break, then the whole buffer.
            int boundaryIndex = FindBoundary(buffer, sentenceEndingsSet, searchFloor, endIndex);
            if (boundaryIndex < 0)
            {
                boundaryIndex = FindBoundary(buffer, wordBreaksSet, searchFloor, endIndex);
            }

            if (boundaryIndex < 0)
            {
                boundaryIndex = endIndex;
            }

            yield return new string(buffer, 0, boundaryIndex).Trim();

            // Find overlap start without word truncation
            int overlapIndex = SkipToWordBreak(buffer, wordBreaksSet, Math.Max(0, boundaryIndex - overlap), boundaryIndex);

            if (overlapIndex == 0 && endIndex == maxChunkSize)
            {
                // Carrying the entire full buffer would leave no room for new text and the rest
                // of the input would never be read; shrink the overlap to the next word break.
                overlapIndex = SkipToWordBreak(buffer, wordBreaksSet, 1, boundaryIndex);
            }

            // Shift the overlap and the text after the boundary to the front of the buffer
            unconsumed = endIndex - boundaryIndex;
            carried = endIndex - overlapIndex;
            if (carried > 0)
            {
                Array.Copy(buffer, overlapIndex, buffer, 0, carried);
            }
        }

        // The reader is exhausted. Text that followed the last boundary has not been yielded yet;
        // emit it (prefixed by its overlap) unless it is only whitespace.
        if (unconsumed > 0 && !string.IsNullOrWhiteSpace(new string(buffer, carried - unconsumed, unconsumed)))
        {
            yield return new string(buffer, 0, carried).Trim();
        }
    }

    // Searches backwards in [floor, end) and returns the index just past the last terminator, or -1.
    static int FindBoundary(char[] buffer, HashSet<char> terminators, int floor, int end)
    {
        for (int i = end - 1; i >= floor; i--)
        {
            if (terminators.Contains(buffer[i]))
            {
                return i + 1;
            }
        }

        return -1;
    }

    // Returns the first index in [start, limit) holding a word break, or limit when there is none.
    static int SkipToWordBreak(char[] buffer, HashSet<char> breaks, int start, int limit)
    {
        int index = start;
        while (index < limit && !breaks.Contains(buffer[index]))
        {
            index++;
        }

        return index;
    }
}