function find_context_core()

in codex-cli/src/utils/agent/apply-patch.ts [332:430]


function find_context_core(
  lines: Array<string>,
  context: Array<string>,
  start: number,
): [number, number] {
  // ---------------------------------------------------------------------------
  // Helpers – Unicode punctuation normalisation
  // ---------------------------------------------------------------------------

  /*
   * The patch-matching algorithm originally required **exact** string equality
   * for non-whitespace characters.  That breaks when the file on disk contains
   * visually identical but different Unicode code-points (e.g. “EN DASH” vs
   * ASCII "-"), because models almost always emit the ASCII variant.  To make
   * apply_patch resilient we canonicalise a handful of common punctuation
   * look-alikes before doing comparisons.
   *
   * We purposefully keep the mapping *small* – only characters that routinely
   * appear in source files and are highly unlikely to introduce ambiguity are
   * included.  Each entry is written using the corresponding Unicode escape so
   * that the file remains ASCII-only even after transpilation.
   */

  const PUNCT_EQUIV: Record<string, string> = {
    // Hyphen / dash variants --------------------------------------------------
    /* U+002D HYPHEN-MINUS */ "-": "-",
    /* U+2010 HYPHEN */ "\u2010": "-",
    /* U+2011 NO-BREAK HYPHEN */ "\u2011": "-",
    /* U+2012 FIGURE DASH */ "\u2012": "-",
    /* U+2013 EN DASH */ "\u2013": "-",
    /* U+2014 EM DASH */ "\u2014": "-",
    /* U+2212 MINUS SIGN */ "\u2212": "-",

    // Double quotes -----------------------------------------------------------
    /* U+0022 QUOTATION MARK */ "\u0022": '"',
    /* U+201C LEFT DOUBLE QUOTATION MARK */ "\u201C": '"',
    /* U+201D RIGHT DOUBLE QUOTATION MARK */ "\u201D": '"',
    /* U+201E DOUBLE LOW-9 QUOTATION MARK */ "\u201E": '"',
    /* U+00AB LEFT-POINTING DOUBLE ANGLE QUOTATION MARK */ "\u00AB": '"',
    /* U+00BB RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK */ "\u00BB": '"',

    // Single quotes -----------------------------------------------------------
    /* U+0027 APOSTROPHE */ "\u0027": "'",
    /* U+2018 LEFT SINGLE QUOTATION MARK */ "\u2018": "'",
    /* U+2019 RIGHT SINGLE QUOTATION MARK */ "\u2019": "'",
    /* U+201B SINGLE HIGH-REVERSED-9 QUOTATION MARK */ "\u201B": "'",
    // Spaces ------------------------------------------------------------------
    /* U+00A0 NO-BREAK SPACE */ "\u00A0": " ",
    /* U+202F NARROW NO-BREAK SPACE */ "\u202F": " ",
  };

  const canon = (s: string): string =>
    s
      // Canonical Unicode composition first
      .normalize("NFC")
      // Replace punctuation look-alikes
      .replace(/./gu, (c) => PUNCT_EQUIV[c] ?? c);
  if (context.length === 0) {
    return [start, 0];
  }
  // Pass 1 – exact equality after canonicalisation ---------------------------
  const canonicalContext = canon(context.join("\n"));
  for (let i = start; i < lines.length; i++) {
    const segment = canon(lines.slice(i, i + context.length).join("\n"));
    if (segment === canonicalContext) {
      return [i, 0];
    }
  }

  // Pass 2 – ignore trailing whitespace -------------------------------------
  for (let i = start; i < lines.length; i++) {
    const segment = canon(
      lines
        .slice(i, i + context.length)
        .map((s) => s.trimEnd())
        .join("\n"),
    );
    const ctx = canon(context.map((s) => s.trimEnd()).join("\n"));
    if (segment === ctx) {
      return [i, 1];
    }
  }

  // Pass 3 – ignore all surrounding whitespace ------------------------------
  for (let i = start; i < lines.length; i++) {
    const segment = canon(
      lines
        .slice(i, i + context.length)
        .map((s) => s.trim())
        .join("\n"),
    );
    const ctx = canon(context.map((s) => s.trim()).join("\n"));
    if (segment === ctx) {
      return [i, 100];
    }
  }

  return [-1, 0];
}