in src/core/xref.js [411:707]
indexObjects() {
  // Simple scan through the PDF content to find objects,
  // trailers and XRef streams.
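  // This brute-force pass rebuilds the cross-reference information when the
  // document's own XRef data is missing or too corrupt to use, and returns
  // the most plausible trailer dictionary candidate.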
  const TAB = 0x9,
    LF = 0xa,
    CR = 0xd,
    SPACE = 0x20;
  const PERCENT = 0x25,
    LT = 0x3c;
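  // Reads a token from `data`, starting at `offset`, until a line-break or
  // the start of a dictionary/hex-string ('<') is found.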
  function readToken(data, offset) {
    let token = "",
      ch = data[offset];
    while (ch !== LF && ch !== CR && ch !== LT) {
      if (++offset >= data.length) {
        break;
      }
      token += String.fromCharCode(ch);
      ch = data[offset];
    }
    return token;
  }
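  // Advances through `data`, starting at `offset`, until the byte sequence
  // `what` is found; returns the number of bytes skipped.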
  function skipUntil(data, offset, what) {
    const length = what.length,
      dataLength = data.length;
    let skipped = 0;
    // Scan forward until the byte sequence is found.
    while (offset < dataLength) {
      let i = 0;
      while (i < length && data[offset + i] === what[i]) {
        ++i;
      }
      if (i >= length) {
        break; // sequence found
      }
      offset++;
      skipped++;
    }
    return skipped;
  }
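  // Regular expressions used during recovery: `objRegExp` parses an indirect
  // object header such as "12 0 obj", while the global variants locate the
  // *next* object/keyword boundary in the raw document string.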
  const gEndobjRegExp = /\b(endobj|\d+\s+\d+\s+obj|xref|trailer\s*<<)\b/g;
  const gStartxrefRegExp = /\b(startxref|\d+\s+\d+\s+obj)\b/g;
  const objRegExp = /^(\d+)\s+(\d+)\s+obj\b/;
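  // Keywords encoded as raw bytes, for searching the binary data directly:
  // "trailer", "startxref" and "/XRef" respectively.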
  const trailerBytes = new Uint8Array([116, 114, 97, 105, 108, 101, 114]);
  const startxrefBytes = new Uint8Array([
    115, 116, 97, 114, 116, 120, 114, 101, 102,
  ]);
  const xrefBytes = new Uint8Array([47, 88, 82, 101, 102]);
  // Clear out any existing entries, since they may be bogus.
  this.entries.length = 0;
  this._cacheMap.clear();

  const stream = this.stream;
  stream.pos = 0;
  const buffer = stream.getBytes(),
    bufferStr = bytesToString(buffer),
    length = buffer.length;
  let position = stream.start;

  const trailers = [],
    xrefStms = [];
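  // Walk the raw bytes of the document: skip whitespace and %-comments, then
  // classify each token as an 'xref' table, an indirect object, or a
  // 'trailer', recording the positions needed for recovery.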
  while (position < length) {
    let ch = buffer[position];
    if (ch === TAB || ch === LF || ch === CR || ch === SPACE) {
      ++position;
      continue;
    }
    if (ch === PERCENT) {
      // %-comment
      do {
        ++position;
        if (position >= length) {
          break;
        }
        ch = buffer[position];
      } while (ch !== LF && ch !== CR);
      continue;
    }
    const token = readToken(buffer, position);
    let m;
    if (
      token.startsWith("xref") &&
      (token.length === 4 || /\s/.test(token[4]))
    ) {
      // A standard cross-reference table: skip ahead to its "trailer"
      // keyword, record that position, and continue at the following
      // "startxref" keyword.
      position += skipUntil(buffer, position, trailerBytes);
      trailers.push(position);
      position += skipUntil(buffer, position, startxrefBytes);
    } else if ((m = objRegExp.exec(token))) {
      const num = m[1] | 0,
        gen = m[2] | 0;
      const startPos = position + token.length;
      let contentLength,
        updateEntries = false;
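      // Only (over)write the entry if none exists yet, or if the generation
      // matches and the new object data parses without immediate errors.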
      if (!this.entries[num]) {
        updateEntries = true;
      } else if (this.entries[num].gen === gen) {
        // Before overwriting an existing entry, ensure that the new one won't
        // cause *immediate* errors when it's accessed (fixes issue13783.pdf).
        try {
          const parser = new Parser({
            lexer: new Lexer(stream.makeSubStream(startPos)),
          });
          parser.getObj();
          updateEntries = true;
        } catch (ex) {
          if (ex instanceof ParserEOFException) {
            warn(`indexObjects -- checking object (${token}): "${ex}".`);
          } else {
            // The error may come from the `Parser`-instance being initialized
            // without an `XRef`-instance (we don't have a usable one yet).
            updateEntries = true;
          }
        }
      }
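      // Record the object, with its offset relative to the stream start.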
      if (updateEntries) {
        this.entries[num] = {
          offset: position - stream.start,
          gen,
          uncompressed: true,
        };
      }
// Find the next "obj" string, rather than "endobj", to ensure that
// we won't skip over a new 'obj' operator in corrupt files where
// 'endobj' operators are missing (fixes issue9105_reduced.pdf).
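      // For example, in a hypothetical corrupt fragment such as
      //   "5 0 obj << ... >> 6 0 obj ... endobj"
      // object 5 lacks an "endobj", so the match on "6 0 obj" delimits it
      // instead.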
      gEndobjRegExp.lastIndex = startPos;
      const match = gEndobjRegExp.exec(bufferStr);
      if (match) {
        const endPos = gEndobjRegExp.lastIndex + 1;
        contentLength = endPos - position;

        if (match[1] !== "endobj") {
          warn(
            `indexObjects: Found "${match[1]}" inside of another "obj", ` +
              'caused by missing "endobj" -- trying to recover.'
          );
          contentLength -= match[1].length + 1;
        }
      } else {
        contentLength = length - position;
      }
      const content = buffer.subarray(position, position + contentLength);

      // Check if this object might be an XRef stream: it must contain the
      // name "/XRef", and the character following it must not be part of a
      // longer name (e.g. a letter).
      const xrefTagOffset = skipUntil(content, 0, xrefBytes);
      if (xrefTagOffset < contentLength && content[xrefTagOffset + 5] < 64) {
        xrefStms.push(position - stream.start);
        this._xrefStms.add(position - stream.start); // Avoid recursion.
      }

      position += contentLength;
    } else if (
      token.startsWith("trailer") &&
      (token.length === 7 || /\s/.test(token[7]))
    ) {
      // A standalone "trailer" keyword: record it, then skip to the position
      // right after the following "startxref".
      trailers.push(position);
      const startPos = position + token.length;
      let contentLength;
      // Attempt to handle (some) corrupt documents, where no 'startxref'
      // operators are present (fixes issue15590.pdf).
      gStartxrefRegExp.lastIndex = startPos;
      const match = gStartxrefRegExp.exec(bufferStr);
      if (match) {
        const endPos = gStartxrefRegExp.lastIndex + 1;
        contentLength = endPos - position;

        if (match[1] !== "startxref") {
          warn(
            `indexObjects: Found "${match[1]}" after "trailer", ` +
              'caused by missing "startxref" -- trying to recover.'
          );
          contentLength -= match[1].length + 1;
        }
      } else {
        contentLength = length - position;
      }
      position += contentLength;
    } else {
      position += token.length + 1;
    }
  }
  // Read the XRef streams that were found during the scan, in recovery mode.
  for (const xrefStm of xrefStms) {
    this.startXRefQueue.push(xrefStm);
    this.readXRef(/* recoveryMode */ true);
  }
  const trailerDicts = [];
  // Pre-parsing the trailers to check if the document is possibly encrypted.
  let isEncrypted = false;
  for (const trailer of trailers) {
    stream.pos = trailer;
    const parser = new Parser({
      lexer: new Lexer(stream),
      xref: this,
      allowStreams: true,
      recoveryMode: true,
    });
    const obj = parser.getObj();
    if (!isCmd(obj, "trailer")) {
      continue;
    }
    // Read the trailer dictionary.
    const dict = parser.getObj();
    if (!(dict instanceof Dict)) {
      continue;
    }
    trailerDicts.push(dict);

    if (dict.has("Encrypt")) {
      isEncrypted = true;
    }
  }
  // Find the main trailer. The candidates are iterated (at most) twice:
  // first with strict validation and then, only if validation errors
  // occurred, once more with the generation-fallback enabled.
  let trailerDict, trailerError;
  for (const dict of [...trailerDicts, "genFallback", ...trailerDicts]) {
    if (dict === "genFallback") {
      if (!trailerError) {
        break; // No need to fallback if there were no validation errors.
      }
      this._generationFallback = true;
      continue;
    }
    // Do some basic validation of the trailer/root dictionary candidate.
    let validPagesDict = false;
    try {
      const rootDict = dict.get("Root");
      if (!(rootDict instanceof Dict)) {
        continue;
      }
      const pagesDict = rootDict.get("Pages");
      if (!(pagesDict instanceof Dict)) {
        continue;
      }
      const pagesCount = pagesDict.get("Count");
      if (Number.isInteger(pagesCount)) {
        validPagesDict = true;
      }
      // The top-level /Pages dictionary isn't obviously corrupt.
    } catch (ex) {
      trailerError = ex;
      continue;
    }
    // Prefer the first candidate that has an 'ID' entry and, when the
    // document is encrypted, an 'Encrypt' entry as well.
    if (
      validPagesDict &&
      (!isEncrypted || dict.has("Encrypt")) &&
      dict.has("ID")
    ) {
      return dict;
    }
    // The current dictionary is a candidate, but continue searching.
    trailerDict = dict;
  }
  // No trailer with 'ID' found; take the last candidate (if one exists).
  if (trailerDict) {
    return trailerDict;
  }
  // No trailer dictionary found; take the "top"-dictionary (if it exists).
  if (this.topDict) {
    return this.topDict;
  }
  // When no trailer dictionary candidate exists, try picking the first
  // dictionary that contains a /Root entry (fixes issue18986.pdf).
  if (!trailerDicts.length) {
    for (const [num, entry] of this.entries.entries()) {
      if (!entry) {
        continue;
      }
      const ref = Ref.get(num, entry.gen);
      let obj;
      try {
        obj = this.fetch(ref);
      } catch {
        continue;
      }
      if (obj instanceof BaseStream) {
        obj = obj.dict;
      }
      if (obj instanceof Dict && obj.has("Root")) {
        return obj;
      }
    }
  }
  // Nothing helped; the document cannot be loaded.
  throw new InvalidPDFException("Invalid PDF structure.");
}