in packages/selector/src/text/match-text-quote.ts [68:214]
export function textQuoteSelectorMatcher(
selector: TextQuoteSelector,
): <TChunk extends Chunk<any>>(
scope: Chunker<TChunk>,
) => AsyncGenerator<ChunkRange<TChunk>, void, void> {
return async function* matchAll<TChunk extends Chunk<string>>(
textChunks: Chunker<TChunk>,
) {
const exact = selector.exact;
const prefix = selector.prefix || '';
const suffix = selector.suffix || '';
const searchPattern = prefix + exact + suffix;
// The code below essentially just performs string.indexOf(searchPattern),
// but on a string that is chopped up in multiple chunks. It runs a loop
// containing three steps:
// 1. Continue checking any partial matches from the previous chunk(s).
// 2. Try find the whole pattern in the chunk (possibly multiple times).
// 3. Check if this chunk ends with a partial match (or even multiple partial matches).
interface PartialMatch {
startChunk?: TChunk;
startIndex?: number;
endChunk?: TChunk;
endIndex?: number;
charactersMatched: number;
}
let partialMatches: PartialMatch[] = [];
let isFirstChunk = true;
do {
const chunk = textChunks.currentChunk;
const chunkValue = chunk.data;
// 1. Continue checking any partial matches from the previous chunk(s).
const remainingPartialMatches: typeof partialMatches = [];
for (const partialMatch of partialMatches) {
const charactersMatched = partialMatch.charactersMatched;
// If the current chunk contains the start and/or end of the match, record these.
if (partialMatch.endChunk === undefined) {
const charactersUntilMatchEnd =
prefix.length + exact.length - charactersMatched;
if (charactersUntilMatchEnd <= chunkValue.length) {
partialMatch.endChunk = chunk;
partialMatch.endIndex = charactersUntilMatchEnd;
}
}
if (partialMatch.startChunk === undefined) {
const charactersUntilMatchStart = prefix.length - charactersMatched;
if (
charactersUntilMatchStart < chunkValue.length ||
partialMatch.endChunk !== undefined // handles an edge case: an empty quote at the end of a chunk.
) {
partialMatch.startChunk = chunk;
partialMatch.startIndex = charactersUntilMatchStart;
}
}
const charactersUntilSuffixEnd =
searchPattern.length - charactersMatched;
if (charactersUntilSuffixEnd <= chunkValue.length) {
if (
chunkValue.startsWith(searchPattern.substring(charactersMatched))
) {
yield partialMatch as ChunkRange<TChunk>; // all fields are certainly defined now.
}
} else if (
chunkValue ===
searchPattern.substring(
charactersMatched,
charactersMatched + chunkValue.length,
)
) {
// The chunk is too short to complete the match; comparison has to be completed in subsequent chunks.
partialMatch.charactersMatched += chunkValue.length;
remainingPartialMatches.push(partialMatch);
}
}
partialMatches = remainingPartialMatches;
// 2. Try find the whole pattern in the chunk (possibly multiple times).
if (searchPattern.length <= chunkValue.length) {
let fromIndex = 0;
while (fromIndex <= chunkValue.length) {
const patternStartIndex = chunkValue.indexOf(
searchPattern,
fromIndex,
);
if (patternStartIndex === -1) break;
fromIndex = patternStartIndex + 1;
// Handle edge case: an empty searchPattern would already have been yielded at the end of the last chunk.
if (
patternStartIndex === 0 &&
searchPattern.length === 0 &&
!isFirstChunk
)
continue;
yield {
startChunk: chunk,
startIndex: patternStartIndex + prefix.length,
endChunk: chunk,
endIndex: patternStartIndex + prefix.length + exact.length,
};
}
}
// 3. Check if this chunk ends with a partial match (or even multiple partial matches).
let newPartialMatches: number[] = [];
const searchStartPoint = Math.max(
chunkValue.length - searchPattern.length + 1,
0,
);
for (let i = searchStartPoint; i < chunkValue.length; i++) {
const character = chunkValue[i];
newPartialMatches = newPartialMatches.filter(
(partialMatchStartIndex) =>
character === searchPattern[i - partialMatchStartIndex],
);
if (character === searchPattern[0]) newPartialMatches.push(i);
}
for (const partialMatchStartIndex of newPartialMatches) {
const charactersMatched = chunkValue.length - partialMatchStartIndex;
const partialMatch: PartialMatch = {
charactersMatched,
};
if (charactersMatched >= prefix.length + exact.length) {
partialMatch.endChunk = chunk;
partialMatch.endIndex =
partialMatchStartIndex + prefix.length + exact.length;
}
if (
charactersMatched > prefix.length ||
partialMatch.endChunk !== undefined // handles an edge case: an empty quote at the end of a chunk.
) {
partialMatch.startChunk = chunk;
partialMatch.startIndex = partialMatchStartIndex + prefix.length;
}
partialMatches.push(partialMatch);
}
isFirstChunk = false;
} while (textChunks.nextChunk() !== null);
};
}