export function textQuoteSelectorMatcher()

in packages/selector/src/text/match-text-quote.ts [68:214]


export function textQuoteSelectorMatcher(
  selector: TextQuoteSelector,
): <TChunk extends Chunk<any>>(
  scope: Chunker<TChunk>,
) => AsyncGenerator<ChunkRange<TChunk>, void, void> {
  return async function* matchAll<TChunk extends Chunk<string>>(
    textChunks: Chunker<TChunk>,
  ) {
    const exact = selector.exact;
    const prefix = selector.prefix || '';
    const suffix = selector.suffix || '';
    const searchPattern = prefix + exact + suffix;

    // The code below essentially just performs string.indexOf(searchPattern),
    // but on a string that is chopped up in multiple chunks. It runs a loop
    // containing three steps:
    // 1. Continue checking any partial matches from the previous chunk(s).
    // 2. Try find the whole pattern in the chunk (possibly multiple times).
    // 3. Check if this chunk ends with a partial match (or even multiple partial matches).

    interface PartialMatch {
      startChunk?: TChunk;
      startIndex?: number;
      endChunk?: TChunk;
      endIndex?: number;
      charactersMatched: number;
    }
    let partialMatches: PartialMatch[] = [];

    let isFirstChunk = true;
    do {
      const chunk = textChunks.currentChunk;
      const chunkValue = chunk.data;

      // 1. Continue checking any partial matches from the previous chunk(s).
      const remainingPartialMatches: typeof partialMatches = [];
      for (const partialMatch of partialMatches) {
        const charactersMatched = partialMatch.charactersMatched;

        // If the current chunk contains the start and/or end of the match, record these.
        if (partialMatch.endChunk === undefined) {
          const charactersUntilMatchEnd =
            prefix.length + exact.length - charactersMatched;
          if (charactersUntilMatchEnd <= chunkValue.length) {
            partialMatch.endChunk = chunk;
            partialMatch.endIndex = charactersUntilMatchEnd;
          }
        }
        if (partialMatch.startChunk === undefined) {
          const charactersUntilMatchStart = prefix.length - charactersMatched;
          if (
            charactersUntilMatchStart < chunkValue.length ||
            partialMatch.endChunk !== undefined // handles an edge case: an empty quote at the end of a chunk.
          ) {
            partialMatch.startChunk = chunk;
            partialMatch.startIndex = charactersUntilMatchStart;
          }
        }

        const charactersUntilSuffixEnd =
          searchPattern.length - charactersMatched;
        if (charactersUntilSuffixEnd <= chunkValue.length) {
          if (
            chunkValue.startsWith(searchPattern.substring(charactersMatched))
          ) {
            yield partialMatch as ChunkRange<TChunk>; // all fields are certainly defined now.
          }
        } else if (
          chunkValue ===
          searchPattern.substring(
            charactersMatched,
            charactersMatched + chunkValue.length,
          )
        ) {
          // The chunk is too short to complete the match; comparison has to be completed in subsequent chunks.
          partialMatch.charactersMatched += chunkValue.length;
          remainingPartialMatches.push(partialMatch);
        }
      }
      partialMatches = remainingPartialMatches;

      // 2. Try find the whole pattern in the chunk (possibly multiple times).
      if (searchPattern.length <= chunkValue.length) {
        let fromIndex = 0;
        while (fromIndex <= chunkValue.length) {
          const patternStartIndex = chunkValue.indexOf(
            searchPattern,
            fromIndex,
          );
          if (patternStartIndex === -1) break;
          fromIndex = patternStartIndex + 1;

          // Handle edge case: an empty searchPattern would already have been yielded at the end of the last chunk.
          if (
            patternStartIndex === 0 &&
            searchPattern.length === 0 &&
            !isFirstChunk
          )
            continue;

          yield {
            startChunk: chunk,
            startIndex: patternStartIndex + prefix.length,
            endChunk: chunk,
            endIndex: patternStartIndex + prefix.length + exact.length,
          };
        }
      }

      // 3. Check if this chunk ends with a partial match (or even multiple partial matches).
      let newPartialMatches: number[] = [];
      const searchStartPoint = Math.max(
        chunkValue.length - searchPattern.length + 1,
        0,
      );
      for (let i = searchStartPoint; i < chunkValue.length; i++) {
        const character = chunkValue[i];
        newPartialMatches = newPartialMatches.filter(
          (partialMatchStartIndex) =>
            character === searchPattern[i - partialMatchStartIndex],
        );
        if (character === searchPattern[0]) newPartialMatches.push(i);
      }
      for (const partialMatchStartIndex of newPartialMatches) {
        const charactersMatched = chunkValue.length - partialMatchStartIndex;
        const partialMatch: PartialMatch = {
          charactersMatched,
        };
        if (charactersMatched >= prefix.length + exact.length) {
          partialMatch.endChunk = chunk;
          partialMatch.endIndex =
            partialMatchStartIndex + prefix.length + exact.length;
        }
        if (
          charactersMatched > prefix.length ||
          partialMatch.endChunk !== undefined // handles an edge case: an empty quote at the end of a chunk.
        ) {
          partialMatch.startChunk = chunk;
          partialMatch.startIndex = partialMatchStartIndex + prefix.length;
        }
        partialMatches.push(partialMatch);
      }

      isFirstChunk = false;
    } while (textChunks.nextChunk() !== null);
  };
}