indexObjects()

in src/core/xref.js [411:707]


  indexObjects() {
    // Simple scan through the PDF content to find objects,
    // trailers and XRef streams.
    const TAB = 0x9,
      LF = 0xa,
      CR = 0xd,
      SPACE = 0x20;
    const PERCENT = 0x25,
      LT = 0x3c;

    function readToken(data, offset) {
      let token = "",
        ch = data[offset];
      while (ch !== LF && ch !== CR && ch !== LT) {
        if (++offset >= data.length) {
          break;
        }
        token += String.fromCharCode(ch);
        ch = data[offset];
      }
      return token;
    }
    function skipUntil(data, offset, what) {
      const length = what.length,
        dataLength = data.length;
      let skipped = 0;
      // finding byte sequence
      while (offset < dataLength) {
        let i = 0;
        while (i < length && data[offset + i] === what[i]) {
          ++i;
        }
        if (i >= length) {
          break; // sequence found
        }
        offset++;
        skipped++;
      }
      return skipped;
    }
    const gEndobjRegExp = /\b(endobj|\d+\s+\d+\s+obj|xref|trailer\s*<<)\b/g;
    const gStartxrefRegExp = /\b(startxref|\d+\s+\d+\s+obj)\b/g;
    const objRegExp = /^(\d+)\s+(\d+)\s+obj\b/;

    const trailerBytes = new Uint8Array([116, 114, 97, 105, 108, 101, 114]);
    const startxrefBytes = new Uint8Array([
      115, 116, 97, 114, 116, 120, 114, 101, 102,
    ]);
    const xrefBytes = new Uint8Array([47, 88, 82, 101, 102]);

    // Clear out any existing entries, since they may be bogus.
    this.entries.length = 0;
    this._cacheMap.clear();

    const stream = this.stream;
    stream.pos = 0;
    const buffer = stream.getBytes(),
      bufferStr = bytesToString(buffer),
      length = buffer.length;
    let position = stream.start;
    const trailers = [],
      xrefStms = [];
    while (position < length) {
      let ch = buffer[position];
      if (ch === TAB || ch === LF || ch === CR || ch === SPACE) {
        ++position;
        continue;
      }
      if (ch === PERCENT) {
        // %-comment
        do {
          ++position;
          if (position >= length) {
            break;
          }
          ch = buffer[position];
        } while (ch !== LF && ch !== CR);
        continue;
      }
      const token = readToken(buffer, position);
      let m;
      if (
        token.startsWith("xref") &&
        (token.length === 4 || /\s/.test(token[4]))
      ) {
        position += skipUntil(buffer, position, trailerBytes);
        trailers.push(position);
        position += skipUntil(buffer, position, startxrefBytes);
      } else if ((m = objRegExp.exec(token))) {
        const num = m[1] | 0,
          gen = m[2] | 0;

        const startPos = position + token.length;
        let contentLength,
          updateEntries = false;
        if (!this.entries[num]) {
          updateEntries = true;
        } else if (this.entries[num].gen === gen) {
          // Before overwriting an existing entry, ensure that the new one won't
          // cause *immediate* errors when it's accessed (fixes issue13783.pdf).
          try {
            const parser = new Parser({
              lexer: new Lexer(stream.makeSubStream(startPos)),
            });
            parser.getObj();
            updateEntries = true;
          } catch (ex) {
            if (ex instanceof ParserEOFException) {
              warn(`indexObjects -- checking object (${token}): "${ex}".`);
            } else {
              // The error may come from the `Parser`-instance being initialized
              // without an `XRef`-instance (we don't have a usable one yet).
              updateEntries = true;
            }
          }
        }
        if (updateEntries) {
          this.entries[num] = {
            offset: position - stream.start,
            gen,
            uncompressed: true,
          };
        }

        // Find the next "obj" string, rather than "endobj", to ensure that
        // we won't skip over a new 'obj' operator in corrupt files where
        // 'endobj' operators are missing (fixes issue9105_reduced.pdf).
        gEndobjRegExp.lastIndex = startPos;
        const match = gEndobjRegExp.exec(bufferStr);

        if (match) {
          const endPos = gEndobjRegExp.lastIndex + 1;
          contentLength = endPos - position;

          if (match[1] !== "endobj") {
            warn(
              `indexObjects: Found "${match[1]}" inside of another "obj", ` +
                'caused by missing "endobj" -- trying to recover.'
            );
            contentLength -= match[1].length + 1;
          }
        } else {
          contentLength = length - position;
        }
        const content = buffer.subarray(position, position + contentLength);

        // checking XRef stream suspect
        // (it shall have '/XRef' and next char is not a letter)
        const xrefTagOffset = skipUntil(content, 0, xrefBytes);
        if (xrefTagOffset < contentLength && content[xrefTagOffset + 5] < 64) {
          xrefStms.push(position - stream.start);
          this._xrefStms.add(position - stream.start); // Avoid recursion
        }

        position += contentLength;
      } else if (
        token.startsWith("trailer") &&
        (token.length === 7 || /\s/.test(token[7]))
      ) {
        trailers.push(position);

        const startPos = position + token.length;
        let contentLength;
        // Attempt to handle (some) corrupt documents, where no 'startxref'
        // operators are present (fixes issue15590.pdf).
        gStartxrefRegExp.lastIndex = startPos;
        const match = gStartxrefRegExp.exec(bufferStr);

        if (match) {
          const endPos = gStartxrefRegExp.lastIndex + 1;
          contentLength = endPos - position;

          if (match[1] !== "startxref") {
            warn(
              `indexObjects: Found "${match[1]}" after "trailer", ` +
                'caused by missing "startxref" -- trying to recover.'
            );
            contentLength -= match[1].length + 1;
          }
        } else {
          contentLength = length - position;
        }
        position += contentLength;
      } else {
        position += token.length + 1;
      }
    }
    // reading XRef streams
    for (const xrefStm of xrefStms) {
      this.startXRefQueue.push(xrefStm);
      this.readXRef(/* recoveryMode */ true);
    }

    const trailerDicts = [];
    // Pre-parsing the trailers to check if the document is possibly encrypted.
    let isEncrypted = false;
    for (const trailer of trailers) {
      stream.pos = trailer;
      const parser = new Parser({
        lexer: new Lexer(stream),
        xref: this,
        allowStreams: true,
        recoveryMode: true,
      });
      const obj = parser.getObj();
      if (!isCmd(obj, "trailer")) {
        continue;
      }
      // read the trailer dictionary
      const dict = parser.getObj();
      if (!(dict instanceof Dict)) {
        continue;
      }
      trailerDicts.push(dict);

      if (dict.has("Encrypt")) {
        isEncrypted = true;
      }
    }

    // finding main trailer
    let trailerDict, trailerError;
    for (const dict of [...trailerDicts, "genFallback", ...trailerDicts]) {
      if (dict === "genFallback") {
        if (!trailerError) {
          break; // No need to fallback if there were no validation errors.
        }
        this._generationFallback = true;
        continue;
      }
      // Do some basic validation of the trailer/root dictionary candidate.
      let validPagesDict = false;
      try {
        const rootDict = dict.get("Root");
        if (!(rootDict instanceof Dict)) {
          continue;
        }
        const pagesDict = rootDict.get("Pages");
        if (!(pagesDict instanceof Dict)) {
          continue;
        }
        const pagesCount = pagesDict.get("Count");
        if (Number.isInteger(pagesCount)) {
          validPagesDict = true;
        }
        // The top-level /Pages dictionary isn't obviously corrupt.
      } catch (ex) {
        trailerError = ex;
        continue;
      }
      // taking the first one with 'ID'
      if (
        validPagesDict &&
        (!isEncrypted || dict.has("Encrypt")) &&
        dict.has("ID")
      ) {
        return dict;
      }
      // The current dictionary is a candidate, but continue searching.
      trailerDict = dict;
    }
    // No trailer with 'ID', taking last one (if exists).
    if (trailerDict) {
      return trailerDict;
    }
    // No trailer dictionary found, taking the "top"-dictionary (if exists).
    if (this.topDict) {
      return this.topDict;
    }

    // When no trailer dictionary candidate exists, try picking the first
    // dictionary that contains a /Root entry (fixes issue18986.pdf).
    if (!trailerDicts.length) {
      for (const [num, entry] of this.entries.entries()) {
        if (!entry) {
          continue;
        }
        const ref = Ref.get(num, entry.gen);
        let obj;

        try {
          obj = this.fetch(ref);
        } catch {
          continue;
        }
        if (obj instanceof BaseStream) {
          obj = obj.dict;
        }
        if (obj instanceof Dict && obj.has("Root")) {
          return obj;
        }
      }
    }

    // nothing helps
    throw new InvalidPDFException("Invalid PDF structure.");
  }