// pdf.js/src/core/xref.js

/* Copyright 2021 Mozilla Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import {
assert,
bytesToString,
FormatError,
info,
InvalidPDFException,
warn,
} from "../shared/util.js";
import { CIRCULAR_REF, Cmd, Dict, isCmd, Ref, RefSet } from "./primitives.js";
import { Lexer, Parser } from "./parser.js";
import {
MissingDataException,
ParserEOFException,
XRefEntryException,
XRefParseException,
} from "./core_utils.js";
import { BaseStream } from "./base_stream.js";
import { CipherTransformFactory } from "./crypto.js";
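
// The XRef class parses the cross-reference table(s) and/or stream(s) of a
// PDF document and provides cached, on-demand access to all of its indirect
// objects.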
class XRef {
constructor(stream, pdfManager) {
this.stream = stream;
this.pdfManager = pdfManager;
this.entries = [];
this._xrefStms = new Set();
this._cacheMap = new Map(); // Prepare the XRef cache.
this._pendingRefs = new RefSet();
this._newPersistentRefNum = null;
this._newTemporaryRefNum = null;
}
getNewPersistentRef(obj) {
// When printing we don't care much about the ref number itself: it can
// increase forever, and it allows us to keep some reusable refs.
if (this._newPersistentRefNum === null) {
this._newPersistentRefNum = this.entries.length || 1;
}
const num = this._newPersistentRefNum++;
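// Cache the object, such that `XRef.fetch` can return it for the new ref.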
this._cacheMap.set(num, obj);
return Ref.get(num, 0);
}
getNewTemporaryRef() {
// When saving we want the numbers to be as small as possible. These refs
// are only created in order to be written in the final PDF stream.
if (this._newTemporaryRefNum === null) {
this._newTemporaryRefNum = this.entries.length || 1;
}
return Ref.get(this._newTemporaryRefNum++, 0);
}
resetNewTemporaryRef() {
// Called once saving is finished.
this._newTemporaryRefNum = null;
}
setStartXRef(startXRef) {
// Store the starting positions of xref tables as we process them
// so we can recover from missing data errors
this.startXRefQueue = [startXRef];
}
parse(recoveryMode = false) {
let trailerDict;
if (!recoveryMode) {
trailerDict = this.readXRef();
} else {
warn("Indexing all PDF objects");
trailerDict = this.indexObjects();
}
trailerDict.assignXref(this);
this.trailer = trailerDict;
let encrypt;
try {
encrypt = trailerDict.get("Encrypt");
} catch (ex) {
if (ex instanceof MissingDataException) {
throw ex;
}
warn(`XRef.parse - Invalid "Encrypt" reference: "${ex}".`);
}
if (encrypt instanceof Dict) {
const ids = trailerDict.get("ID");
const fileId = ids?.length ? ids[0] : "";
// The 'Encrypt' dictionary itself should not be encrypted, and by
// setting `suppressEncryption` we can prevent an infinite loop inside
// of `fetchUncompressed` if the dictionary contains indirect
// objects (fixes issue7665.pdf).
encrypt.suppressEncryption = true;
this.encrypt = new CipherTransformFactory(
encrypt,
fileId,
this.pdfManager.password
);
}
// Get the root dictionary (catalog) object, and do some basic validation.
let root;
try {
root = trailerDict.get("Root");
} catch (ex) {
if (ex instanceof MissingDataException) {
throw ex;
}
warn(`XRef.parse - Invalid "Root" reference: "${ex}".`);
}
if (root instanceof Dict) {
try {
const pages = root.get("Pages");
if (pages instanceof Dict) {
this.root = root;
return;
}
} catch (ex) {
if (ex instanceof MissingDataException) {
throw ex;
}
warn(`XRef.parse - Invalid "Pages" reference: "${ex}".`);
}
}
if (!recoveryMode) {
throw new XRefParseException();
}
// Even recovery failed; there's nothing more we can do here.
throw new InvalidPDFException("Invalid Root reference.");
}
processXRefTable(parser) {
if (!("tableState" in this)) {
// Stores state of the table as we process it so we can resume
// from middle of table in case of missing data error
this.tableState = {
entryNum: 0,
streamPos: parser.lexer.stream.pos,
parserBuf1: parser.buf1,
parserBuf2: parser.buf2,
};
}
const obj = this.readXRefTable(parser);
// Sanity check
if (!isCmd(obj, "trailer")) {
throw new FormatError(
"Invalid XRef table: could not find trailer dictionary"
);
}
// Read trailer dictionary, e.g.
// trailer
// << /Size 22
// /Root 20 0 R
// /Info 10 0 R
// /ID [ <81b14aafa313db63dbd6f981e49f94f4> ]
// >>
// The parser goes through the entire stream << ... >> and provides
// a getter interface for the key-value table
let dict = parser.getObj();
// The pdflib PDF generator can generate a nested trailer dictionary
if (!(dict instanceof Dict) && dict.dict) {
dict = dict.dict;
}
if (!(dict instanceof Dict)) {
throw new FormatError(
"Invalid XRef table: could not parse trailer dictionary"
);
}
delete this.tableState;
return dict;
}
readXRefTable(parser) {
// Example of cross-reference table:
// xref
// 0 1 <-- subsection header (first obj #, obj count)
// 0000000000 65535 f <-- actual object (offset, generation #, f/n)
// 23 2 <-- subsection header ... and so on ...
// 0000025518 00002 n
// 0000025635 00000 n
// trailer
// ...
const stream = parser.lexer.stream;
const tableState = this.tableState;
stream.pos = tableState.streamPos;
parser.buf1 = tableState.parserBuf1;
parser.buf2 = tableState.parserBuf2;
// Outer loop is over subsection headers
let obj;
while (true) {
if (!("firstEntryNum" in tableState) || !("entryCount" in tableState)) {
if (isCmd((obj = parser.getObj()), "trailer")) {
break;
}
tableState.firstEntryNum = obj;
tableState.entryCount = parser.getObj();
}
let first = tableState.firstEntryNum;
const count = tableState.entryCount;
if (!Number.isInteger(first) || !Number.isInteger(count)) {
throw new FormatError(
"Invalid XRef table: wrong types in subsection header"
);
}
// Inner loop is over objects themselves
for (let i = tableState.entryNum; i < count; i++) {
tableState.streamPos = stream.pos;
tableState.entryNum = i;
tableState.parserBuf1 = parser.buf1;
tableState.parserBuf2 = parser.buf2;
const entry = {};
entry.offset = parser.getObj();
entry.gen = parser.getObj();
const type = parser.getObj();
if (type instanceof Cmd) {
switch (type.cmd) {
case "f":
entry.free = true;
break;
case "n":
entry.uncompressed = true;
break;
}
}
// Validate entry obj
if (
!Number.isInteger(entry.offset) ||
!Number.isInteger(entry.gen) ||
!(entry.free || entry.uncompressed)
) {
throw new FormatError(
`Invalid entry in XRef subsection: ${first}, ${count}`
);
}
// The first xref table entry, i.e. obj 0, should be free. Attempting
// to adjust an incorrect first obj # (fixes issue 3248 and 7229).
if (i === 0 && entry.free && first === 1) {
first = 0;
}
if (!this.entries[i + first]) {
this.entries[i + first] = entry;
}
}
tableState.entryNum = 0;
tableState.streamPos = stream.pos;
tableState.parserBuf1 = parser.buf1;
tableState.parserBuf2 = parser.buf2;
delete tableState.firstEntryNum;
delete tableState.entryCount;
}
// Sanity check: as per spec, first object must be free
if (this.entries[0] && !this.entries[0].free) {
throw new FormatError("Invalid XRef table: unexpected first object");
}
return obj;
}
processXRefStream(stream) {
if (!("streamState" in this)) {
// Stores state of the stream as we process it so we can resume
// from middle of stream in case of missing data error
const streamParameters = stream.dict;
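// The /W entry holds the byte widths of the three fields in each entry,
// and /Index lists [first, count] subsection pairs (default: [0, Size]).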
const byteWidths = streamParameters.get("W");
let range = streamParameters.get("Index");
if (!range) {
range = [0, streamParameters.get("Size")];
}
this.streamState = {
entryRanges: range,
byteWidths,
entryNum: 0,
streamPos: stream.pos,
};
}
this.readXRefStream(stream);
delete this.streamState;
return stream.dict;
}
readXRefStream(stream) {
const streamState = this.streamState;
stream.pos = streamState.streamPos;
const [typeFieldWidth, offsetFieldWidth, generationFieldWidth] =
streamState.byteWidths;
const entryRanges = streamState.entryRanges;
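// Each iteration of the outer loop consumes one [first, count] pair from
// the remaining entry ranges.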
while (entryRanges.length > 0) {
const [first, n] = entryRanges;
if (!Number.isInteger(first) || !Number.isInteger(n)) {
throw new FormatError(`Invalid XRef range fields: ${first}, ${n}`);
}
if (
!Number.isInteger(typeFieldWidth) ||
!Number.isInteger(offsetFieldWidth) ||
!Number.isInteger(generationFieldWidth)
) {
throw new FormatError(
`Invalid XRef entry fields length: ${first}, ${n}`
);
}
for (let i = streamState.entryNum; i < n; ++i) {
streamState.entryNum = i;
streamState.streamPos = stream.pos;
let type = 0,
offset = 0,
generation = 0;
for (let j = 0; j < typeFieldWidth; ++j) {
const typeByte = stream.getByte();
if (typeByte === -1) {
throw new FormatError("Invalid XRef byteWidths 'type'.");
}
type = (type << 8) | typeByte;
}
// if type field is absent, its default value is 1
if (typeFieldWidth === 0) {
type = 1;
}
for (let j = 0; j < offsetFieldWidth; ++j) {
const offsetByte = stream.getByte();
if (offsetByte === -1) {
throw new FormatError("Invalid XRef byteWidths 'offset'.");
}
offset = (offset << 8) | offsetByte;
}
for (let j = 0; j < generationFieldWidth; ++j) {
const generationByte = stream.getByte();
if (generationByte === -1) {
throw new FormatError("Invalid XRef byteWidths 'generation'.");
}
generation = (generation << 8) | generationByte;
}
const entry = {};
entry.offset = offset;
entry.gen = generation;
switch (type) {
case 0:
entry.free = true;
break;
case 1:
entry.uncompressed = true;
break;
case 2:
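// Compressed entry: `offset` is the object number of the containing
// ObjStm and `gen` is the index within it (see `fetchCompressed`).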
break;
default:
throw new FormatError(`Invalid XRef entry type: ${type}`);
}
if (!this.entries[first + i]) {
this.entries[first + i] = entry;
}
}
streamState.entryNum = 0;
streamState.streamPos = stream.pos;
entryRanges.splice(0, 2);
}
}
indexObjects() {
// Simple scan through the PDF content to find objects,
// trailers and XRef streams.
const TAB = 0x9,
LF = 0xa,
CR = 0xd,
SPACE = 0x20;
const PERCENT = 0x25,
LT = 0x3c;
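// Read characters into a token until the next line-break or "<".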
function readToken(data, offset) {
let token = "",
ch = data[offset];
while (ch !== LF && ch !== CR && ch !== LT) {
if (++offset >= data.length) {
break;
}
token += String.fromCharCode(ch);
ch = data[offset];
}
return token;
}
function skipUntil(data, offset, what) {
const length = what.length,
dataLength = data.length;
let skipped = 0;
// finding byte sequence
while (offset < dataLength) {
let i = 0;
while (i < length && data[offset + i] === what[i]) {
++i;
}
if (i >= length) {
break; // sequence found
}
offset++;
skipped++;
}
return skipped;
}
const gEndobjRegExp = /\b(endobj|\d+\s+\d+\s+obj|xref|trailer)\b/g;
const gStartxrefRegExp = /\b(startxref|\d+\s+\d+\s+obj)\b/g;
const objRegExp = /^(\d+)\s+(\d+)\s+obj\b/;
const trailerBytes = new Uint8Array([116, 114, 97, 105, 108, 101, 114]);
const startxrefBytes = new Uint8Array([
115, 116, 97, 114, 116, 120, 114, 101, 102,
]);
const xrefBytes = new Uint8Array([47, 88, 82, 101, 102]);
// Clear out any existing entries, since they may be bogus.
this.entries.length = 0;
this._cacheMap.clear();
const stream = this.stream;
stream.pos = 0;
const buffer = stream.getBytes(),
bufferStr = bytesToString(buffer),
length = buffer.length;
let position = stream.start;
const trailers = [],
xrefStms = [];
while (position < length) {
let ch = buffer[position];
if (ch === TAB || ch === LF || ch === CR || ch === SPACE) {
++position;
continue;
}
if (ch === PERCENT) {
// %-comment
do {
++position;
if (position >= length) {
break;
}
ch = buffer[position];
} while (ch !== LF && ch !== CR);
continue;
}
const token = readToken(buffer, position);
let m;
if (
token.startsWith("xref") &&
(token.length === 4 || /\s/.test(token[4]))
) {
position += skipUntil(buffer, position, trailerBytes);
trailers.push(position);
position += skipUntil(buffer, position, startxrefBytes);
} else if ((m = objRegExp.exec(token))) {
const num = m[1] | 0,
gen = m[2] | 0;
const startPos = position + token.length;
let contentLength,
updateEntries = false;
if (!this.entries[num]) {
updateEntries = true;
} else if (this.entries[num].gen === gen) {
// Before overwriting an existing entry, ensure that the new one won't
// cause *immediate* errors when it's accessed (fixes issue13783.pdf).
try {
const parser = new Parser({
lexer: new Lexer(stream.makeSubStream(startPos)),
});
parser.getObj();
updateEntries = true;
} catch (ex) {
if (ex instanceof ParserEOFException) {
warn(`indexObjects -- checking object (${token}): "${ex}".`);
} else {
// The error may come from the `Parser`-instance being initialized
// without an `XRef`-instance (we don't have a usable one yet).
updateEntries = true;
}
}
}
if (updateEntries) {
this.entries[num] = {
offset: position - stream.start,
gen,
uncompressed: true,
};
}
// Find the next "obj" string, rather than "endobj", to ensure that
// we won't skip over a new 'obj' operator in corrupt files where
// 'endobj' operators are missing (fixes issue9105_reduced.pdf).
gEndobjRegExp.lastIndex = startPos;
const match = gEndobjRegExp.exec(bufferStr);
if (match) {
const endPos = gEndobjRegExp.lastIndex + 1;
contentLength = endPos - position;
if (match[1] !== "endobj") {
warn(
`indexObjects: Found "${match[1]}" inside of another "obj", ` +
'caused by missing "endobj" -- trying to recover.'
);
contentLength -= match[1].length + 1;
}
} else {
contentLength = length - position;
}
const content = buffer.subarray(position, position + contentLength);
// Check whether this could be an XRef stream: it must contain "/XRef",
// and the character following it must not be a letter.
const xrefTagOffset = skipUntil(content, 0, xrefBytes);
if (xrefTagOffset < contentLength && content[xrefTagOffset + 5] < 64) {
xrefStms.push(position - stream.start);
this._xrefStms.add(position - stream.start); // Avoid recursion
}
position += contentLength;
} else if (
token.startsWith("trailer") &&
(token.length === 7 || /\s/.test(token[7]))
) {
trailers.push(position);
const startPos = position + token.length;
let contentLength;
// Attempt to handle (some) corrupt documents, where no 'startxref'
// operators are present (fixes issue15590.pdf).
gStartxrefRegExp.lastIndex = startPos;
const match = gStartxrefRegExp.exec(bufferStr);
if (match) {
const endPos = gStartxrefRegExp.lastIndex + 1;
contentLength = endPos - position;
if (match[1] !== "startxref") {
warn(
`indexObjects: Found "${match[1]}" after "trailer", ` +
'caused by missing "startxref" -- trying to recover.'
);
contentLength -= match[1].length + 1;
}
} else {
contentLength = length - position;
}
position += contentLength;
} else {
position += token.length + 1;
}
}
// reading XRef streams
for (const xrefStm of xrefStms) {
this.startXRefQueue.push(xrefStm);
this.readXRef(/* recoveryMode */ true);
}
const trailerDicts = [];
// Pre-parsing the trailers to check if the document is possibly encrypted.
let isEncrypted = false;
for (const trailer of trailers) {
stream.pos = trailer;
const parser = new Parser({
lexer: new Lexer(stream),
xref: this,
allowStreams: true,
recoveryMode: true,
});
const obj = parser.getObj();
if (!isCmd(obj, "trailer")) {
continue;
}
// read the trailer dictionary
const dict = parser.getObj();
if (!(dict instanceof Dict)) {
continue;
}
trailerDicts.push(dict);
if (dict.has("Encrypt")) {
isEncrypted = true;
}
}
// finding main trailer
let trailerDict, trailerError;
for (const dict of [...trailerDicts, "genFallback", ...trailerDicts]) {
if (dict === "genFallback") {
if (!trailerError) {
break; // No need to fallback if there were no validation errors.
}
this._generationFallback = true;
continue;
}
// Do some basic validation of the trailer/root dictionary candidate.
let validPagesDict = false;
try {
const rootDict = dict.get("Root");
if (!(rootDict instanceof Dict)) {
continue;
}
const pagesDict = rootDict.get("Pages");
if (!(pagesDict instanceof Dict)) {
continue;
}
const pagesCount = pagesDict.get("Count");
if (Number.isInteger(pagesCount)) {
validPagesDict = true;
}
// The top-level /Pages dictionary isn't obviously corrupt.
} catch (ex) {
trailerError = ex;
continue;
}
// Take the first valid trailer that has an 'ID' entry (and an 'Encrypt'
// entry, if the document appears to be encrypted).
if (
validPagesDict &&
(!isEncrypted || dict.has("Encrypt")) &&
dict.has("ID")
) {
return dict;
}
// The current dictionary is a candidate, but continue searching.
trailerDict = dict;
}
// No trailer with 'ID' found; take the last candidate (if one exists).
if (trailerDict) {
return trailerDict;
}
// No trailer dictionary found; take the "top" dictionary (if one exists).
if (this.topDict) {
return this.topDict;
}
// nothing helps
throw new InvalidPDFException("Invalid PDF structure.");
}
readXRef(recoveryMode = false) {
const stream = this.stream;
// Keep track of already parsed XRef tables, to prevent an infinite loop
// when parsing corrupt PDF files where e.g. the /Prev entries create a
// circular dependency between tables (fixes bug1393476.pdf).
const startXRefParsedCache = new Set();
while (this.startXRefQueue.length) {
try {
const startXRef = this.startXRefQueue[0];
if (startXRefParsedCache.has(startXRef)) {
warn("readXRef - skipping XRef table since it was already parsed.");
this.startXRefQueue.shift();
continue;
}
startXRefParsedCache.add(startXRef);
stream.pos = startXRef + stream.start;
const parser = new Parser({
lexer: new Lexer(stream),
xref: this,
allowStreams: true,
});
let obj = parser.getObj();
let dict;
// Get dictionary
if (isCmd(obj, "xref")) {
// Parse end-of-file XRef
dict = this.processXRefTable(parser);
if (!this.topDict) {
this.topDict = dict;
}
// Recursively get other XRefs 'XRefStm', if any
obj = dict.get("XRefStm");
if (Number.isInteger(obj) && !this._xrefStms.has(obj)) {
// ignore previously loaded xref streams
// (possible infinite recursion)
this._xrefStms.add(obj);
this.startXRefQueue.push(obj);
}
} else if (Number.isInteger(obj)) {
// Parse in-stream XRef
if (
!Number.isInteger(parser.getObj()) ||
!isCmd(parser.getObj(), "obj") ||
!((obj = parser.getObj()) instanceof BaseStream)
) {
throw new FormatError("Invalid XRef stream");
}
dict = this.processXRefStream(obj);
if (!this.topDict) {
this.topDict = dict;
}
if (!dict) {
throw new FormatError("Failed to read XRef stream");
}
} else {
throw new FormatError("Invalid XRef stream header");
}
// Recursively get previous dictionary, if any
obj = dict.get("Prev");
if (Number.isInteger(obj)) {
this.startXRefQueue.push(obj);
} else if (obj instanceof Ref) {
// The spec says Prev must not be a reference, i.e. "/Prev NNN"
// This is a fallback for non-compliant PDFs, i.e. "/Prev NNN 0 R"
this.startXRefQueue.push(obj.num);
}
} catch (e) {
if (e instanceof MissingDataException) {
throw e;
}
info("(while reading XRef): " + e);
}
this.startXRefQueue.shift();
}
if (this.topDict) {
return this.topDict;
}
if (recoveryMode) {
return undefined;
}
throw new XRefParseException();
}
get lastXRefStreamPos() {
return this._xrefStms.size > 0 ? Math.max(...this._xrefStms) : null;
}
getEntry(i) {
const xrefEntry = this.entries[i];
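// Only return entries that exist, are in use, and have a non-zero offset.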
if (xrefEntry && !xrefEntry.free && xrefEntry.offset) {
return xrefEntry;
}
return null;
}
fetchIfRef(obj, suppressEncryption = false) {
if (obj instanceof Ref) {
return this.fetch(obj, suppressEncryption);
}
return obj;
}
fetch(ref, suppressEncryption = false) {
if (!(ref instanceof Ref)) {
throw new Error("ref object is not a reference");
}
const num = ref.num;
// The XRef cache is populated with objects which are obtained through
// `Parser.getObj`, and indirectly via `Lexer.getObj`. Neither of these
// methods should ever return `undefined` (note the `assert` calls below).
const cacheEntry = this._cacheMap.get(num);
if (cacheEntry !== undefined) {
// In documents with Object Streams, it's possible that cached `Dict`s
// have not been assigned an `objId` yet (see e.g. issue3115r.pdf).
if (cacheEntry instanceof Dict && !cacheEntry.objId) {
cacheEntry.objId = ref.toString();
}
return cacheEntry;
}
let xrefEntry = this.getEntry(num);
if (xrefEntry === null) {
// The referenced entry can be free.
this._cacheMap.set(num, xrefEntry);
return xrefEntry;
}
// Prevent circular references, in corrupt PDF documents, from hanging the
// worker-thread. This relies, implicitly, on the parsing being synchronous.
if (this._pendingRefs.has(ref)) {
this._pendingRefs.remove(ref);
warn(`Ignoring circular reference: ${ref}.`);
return CIRCULAR_REF;
}
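// Mark the reference as in-flight, such that re-entrant fetches of the
// same reference are detected by the check above.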
this._pendingRefs.put(ref);
try {
if (xrefEntry.uncompressed) {
xrefEntry = this.fetchUncompressed(ref, xrefEntry, suppressEncryption);
} else {
xrefEntry = this.fetchCompressed(ref, xrefEntry, suppressEncryption);
}
this._pendingRefs.remove(ref);
} catch (ex) {
this._pendingRefs.remove(ref);
throw ex;
}
if (xrefEntry instanceof Dict) {
xrefEntry.objId = ref.toString();
} else if (xrefEntry instanceof BaseStream) {
xrefEntry.dict.objId = ref.toString();
}
return xrefEntry;
}
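// Parse an object stored directly in the file: validate the "num gen obj"
// header at the recorded offset, then parse the object itself.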
fetchUncompressed(ref, xrefEntry, suppressEncryption = false) {
const gen = ref.gen;
let num = ref.num;
if (xrefEntry.gen !== gen) {
const msg = `Inconsistent generation in XRef: ${ref}`;
// Try falling back to a *previous* generation (fixes issue15577.pdf).
if (this._generationFallback && xrefEntry.gen < gen) {
warn(msg);
return this.fetchUncompressed(
Ref.get(num, xrefEntry.gen),
xrefEntry,
suppressEncryption
);
}
throw new XRefEntryException(msg);
}
const stream = this.stream.makeSubStream(
xrefEntry.offset + this.stream.start
);
const parser = new Parser({
lexer: new Lexer(stream),
xref: this,
allowStreams: true,
});
const obj1 = parser.getObj();
const obj2 = parser.getObj();
const obj3 = parser.getObj();
if (obj1 !== num || obj2 !== gen || !(obj3 instanceof Cmd)) {
throw new XRefEntryException(`Bad (uncompressed) XRef entry: ${ref}`);
}
if (obj3.cmd !== "obj") {
// some bad PDFs use "obj1234" and really mean 1234
if (obj3.cmd.startsWith("obj")) {
num = parseInt(obj3.cmd.substring(3), 10);
if (!Number.isNaN(num)) {
return num;
}
}
throw new XRefEntryException(`Bad (uncompressed) XRef entry: ${ref}`);
}
if (this.encrypt && !suppressEncryption) {
xrefEntry = parser.getObj(this.encrypt.createCipherTransform(num, gen));
} else {
xrefEntry = parser.getObj();
}
if (!(xrefEntry instanceof BaseStream)) {
if (typeof PDFJSDev === "undefined" || PDFJSDev.test("TESTING")) {
assert(
xrefEntry !== undefined,
'fetchUncompressed: The "xrefEntry" cannot be undefined.'
);
}
this._cacheMap.set(num, xrefEntry);
}
return xrefEntry;
}
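// For compressed entries, `offset` is the object number of the ObjStm
// (object stream) containing the object and `gen` is its index therein.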
fetchCompressed(ref, xrefEntry, suppressEncryption = false) {
const tableOffset = xrefEntry.offset;
const stream = this.fetch(Ref.get(tableOffset, 0));
if (!(stream instanceof BaseStream)) {
throw new FormatError("bad ObjStm stream");
}
const first = stream.dict.get("First");
const n = stream.dict.get("N");
if (!Number.isInteger(first) || !Number.isInteger(n)) {
throw new FormatError("invalid first and n parameters for ObjStm stream");
}
let parser = new Parser({
lexer: new Lexer(stream),
xref: this,
allowStreams: true,
});
const nums = new Array(n);
const offsets = new Array(n);
// read the object numbers to populate cache
for (let i = 0; i < n; ++i) {
const num = parser.getObj();
if (!Number.isInteger(num)) {
throw new FormatError(
`invalid object number in the ObjStm stream: ${num}`
);
}
const offset = parser.getObj();
if (!Number.isInteger(offset)) {
throw new FormatError(
`invalid object offset in the ObjStm stream: ${offset}`
);
}
nums[i] = num;
offsets[i] = offset;
}
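// The /First entry gives the byte offset, within the stream, of the
// first object.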
const start = (stream.start || 0) + first;
const entries = new Array(n);
// read stream objects for cache
for (let i = 0; i < n; ++i) {
const length = i < n - 1 ? offsets[i + 1] - offsets[i] : undefined;
if (length < 0) {
throw new FormatError("Invalid offset in the ObjStm stream.");
}
parser = new Parser({
lexer: new Lexer(
stream.makeSubStream(start + offsets[i], length, stream.dict)
),
xref: this,
allowStreams: true,
});
const obj = parser.getObj();
entries[i] = obj;
if (obj instanceof BaseStream) {
continue;
}
const num = nums[i],
entry = this.entries[num];
if (entry && entry.offset === tableOffset && entry.gen === i) {
if (typeof PDFJSDev === "undefined" || PDFJSDev.test("TESTING")) {
assert(
obj !== undefined,
'fetchCompressed: The "obj" cannot be undefined.'
);
}
this._cacheMap.set(num, obj);
}
}
xrefEntry = entries[xrefEntry.gen];
if (xrefEntry === undefined) {
throw new XRefEntryException(`Bad (compressed) XRef entry: ${ref}`);
}
return xrefEntry;
}
async fetchIfRefAsync(obj, suppressEncryption) {
if (obj instanceof Ref) {
return this.fetchAsync(obj, suppressEncryption);
}
return obj;
}
async fetchAsync(ref, suppressEncryption) {
try {
return this.fetch(ref, suppressEncryption);
} catch (ex) {
if (!(ex instanceof MissingDataException)) {
throw ex;
}
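// The data is not loaded yet; request the missing range and then retry.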
await this.pdfManager.requestRange(ex.begin, ex.end);
return this.fetchAsync(ref, suppressEncryption);
}
}
getCatalogObj() {
return this.root;
}
}
export { XRef };