// pdf.js/src/core/xref.js
/* Copyright 2021 Mozilla Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import {
assert,
bytesToString,
FormatError,
info,
InvalidPDFException,
warn,
} from "../shared/util.js";
import { CIRCULAR_REF, Cmd, Dict, isCmd, Ref, RefSet } from "./primitives.js";
import {
DocStats,
MissingDataException,
ParserEOFException,
XRefEntryException,
XRefParseException,
} from "./core_utils.js";
import { Lexer, Parser } from "./parser.js";
import { BaseStream } from "./base_stream.js";
import { CipherTransformFactory } from "./crypto.js";
class XRef {
constructor(stream, pdfManager) {
this.stream = stream;
this.pdfManager = pdfManager;
this.entries = [];
this.xrefstms = Object.create(null);
this._cacheMap = new Map(); // Prepare the XRef cache.
Prevent circular references in XRef tables from hanging the worker-thread (issue 14303) *Please note:* While this patch on its own is sufficient to prevent the worker-thread from hanging, however in combination with PR 14311 these PDF documents will both load *and* render correctly. Rather than focusing on the particular structure of these PDF documents, it seemed (at least to me) to make sense to try and prevent all circular references when fetching/looking-up data using the XRef table. To avoid a solution that required tracking the references manually everywhere, the implementation settled on here instead handles that internally in the `XRef.fetch`-method. This should work, since that method *and* the `Parser`/`Lexer`-implementations are completely synchronous. Note also that the existing `XRef`-caching, used for all data-types *except* Streams, should hopefully help to lessen the performance impact of these changes. One *potential* problem with these changes could be certain *browser* exceptions, since those are generally not catchable in JavaScript code, however those would most likely "stop" worker-thread parsing anyway (at least I hope so). Finally, note that I settled on returning dummy-data rather than throwing an exception. This was done to allow parsing, for the rest of the document, to continue such that *one* bad reference doesn't prevent an entire document from loading. Fixes two of the issues listed in issue 14303, namely the `poppler-91414-0.zip-2.gz-53.pdf` and `poppler-91414-0.zip-2.gz-54.pdf` documents.
2021-11-26 22:11:39 +09:00
this._pendingRefs = new RefSet();
[api-minor] Replace `PDFDocumentProxy.getStats` with a synchronous `PDFDocumentProxy.stats` getter *Please note:* These changes will primarily benefit longer documents, somewhat at the expense of e.g. one-page documents. The existing `PDFDocumentProxy.getStats` function, which in the default viewer is called for each rendered page, requires a round-trip to the worker-thread in order to obtain the current document stats. In the default viewer, we currently make one such API-call for *every rendered* page. This patch proposes replacing that method with a *synchronous* `PDFDocumentProxy.stats` getter instead, combined with re-factoring the worker-thread code by adding a `DocStats`-class to track Stream/Font-types and *only send* them to the main-thread *the first time* that a type is encountered. Note that in practice most PDF documents only use a fairly limited number of Stream/Font-types, which means that in longer documents most of the `PDFDocumentProxy.getStats`-calls will return the same data.[1] This re-factoring will obviously benefit longer document the most[2], and could actually be seen as a regression for one-page documents, since in practice there'll usually be a couple of "DocStats" messages sent during the parsing of the first page. However, if the user zooms/rotates the document (which causes re-rendering), note that even a one-page document would start to benefit from these changes. Another benefit of having the data available/cached in the API is that unless the document stats change during parsing, repeated `PDFDocumentProxy.stats`-calls will return *the same identical* object. This is something that we can easily take advantage of in the default viewer, by now *only* reporting "documentStats" telemetry[3] when the data actually have changed rather than once per rendered page (again beneficial in longer documents). 
--- [1] Furthermore, the maximium number of `StreamType`/`FontType` are `10` respectively `12`, which means that regardless of the complexity and page count in a PDF document there'll never be more than twenty-two "DocStats" messages sent; see https://github.com/mozilla/pdf.js/blob/41ac3f0c07128bf34baccdcc067a108c712fd6ef/src/shared/util.js#L206-L232 [2] One example is the `pdf.pdf` document in the test-suite, where rendering all of its 1310 pages only result in a total of seven "DocStats" messages being sent from the worker-thread. [3] Reporting telemetry, in Firefox, includes using `JSON.stringify` on the data and then sending an event to the `PdfStreamConverter.jsm`-code. In that code the event is handled and `JSON.parse` is used to retrieve the data, and in the "documentStats"-case we'll then iterate through the data to avoid double-reporting telemetry; see https://searchfox.org/mozilla-central/rev/8f4c180b87e52f3345ef8a3432d6e54bd1eb18dc/toolkit/components/pdfjs/content/PdfStreamConverter.jsm#515-549
2021-11-12 02:14:26 +09:00
this.stats = new DocStats(pdfManager.msgHandler);
this._newRefNum = null;
}
2021-04-14 01:26:12 +09:00
getNewRef() {
if (this._newRefNum === null) {
this._newRefNum = this.entries.length || 1;
2021-04-14 01:26:12 +09:00
}
return Ref.get(this._newRefNum++, 0);
}
2021-04-14 01:26:12 +09:00
resetNewRef() {
this._newRefNum = null;
}
2021-04-14 01:26:12 +09:00
setStartXRef(startXRef) {
// Store the starting positions of xref tables as we process them
// so we can recover from missing data errors
this.startXRefQueue = [startXRef];
}
2021-04-14 01:26:12 +09:00
parse(recoveryMode = false) {
let trailerDict;
2021-04-14 01:26:12 +09:00
if (!recoveryMode) {
trailerDict = this.readXRef();
} else {
warn("Indexing all PDF objects");
trailerDict = this.indexObjects();
}
trailerDict.assignXref(this);
this.trailer = trailerDict;
let encrypt;
try {
encrypt = trailerDict.get("Encrypt");
} catch (ex) {
if (ex instanceof MissingDataException) {
throw ex;
}
warn(`XRef.parse - Invalid "Encrypt" reference: "${ex}".`);
}
if (encrypt instanceof Dict) {
const ids = trailerDict.get("ID");
const fileId = ids && ids.length ? ids[0] : "";
2021-04-14 01:26:12 +09:00
// The 'Encrypt' dictionary itself should not be encrypted, and by
// setting `suppressEncryption` we can prevent an infinite loop inside
// of `XRef_fetchUncompressed` if the dictionary contains indirect
// objects (fixes issue7665.pdf).
encrypt.suppressEncryption = true;
this.encrypt = new CipherTransformFactory(
encrypt,
fileId,
this.pdfManager.password
);
}
// Get the root dictionary (catalog) object, and do some basic validation.
let root;
try {
root = trailerDict.get("Root");
} catch (ex) {
if (ex instanceof MissingDataException) {
throw ex;
}
warn(`XRef.parse - Invalid "Root" reference: "${ex}".`);
}
2021-12-03 00:40:31 +09:00
if (root instanceof Dict) {
try {
const pages = root.get("Pages");
if (pages instanceof Dict) {
this.root = root;
return;
}
} catch (ex) {
if (ex instanceof MissingDataException) {
throw ex;
}
warn(`XRef.parse - Invalid "Pages" reference: "${ex}".`);
}
2021-04-14 01:26:12 +09:00
}
2021-12-03 00:40:31 +09:00
if (!recoveryMode) {
throw new XRefParseException();
}
// Even recovery failed, there's nothing more we can do here.
throw new InvalidPDFException("Invalid Root reference.");
2021-04-14 01:26:12 +09:00
}
2021-04-14 01:26:12 +09:00
processXRefTable(parser) {
if (!("tableState" in this)) {
// Stores state of the table as we process it so we can resume
// from middle of table in case of missing data error
this.tableState = {
entryNum: 0,
streamPos: parser.lexer.stream.pos,
parserBuf1: parser.buf1,
parserBuf2: parser.buf2,
};
}
const obj = this.readXRefTable(parser);
2021-04-14 01:26:12 +09:00
// Sanity check
if (!isCmd(obj, "trailer")) {
throw new FormatError(
"Invalid XRef table: could not find trailer dictionary"
);
}
// Read trailer dictionary, e.g.
// trailer
// << /Size 22
// /Root 20R
// /Info 10R
// /ID [ <81b14aafa313db63dbd6f981e49f94f4> ]
// >>
// The parser goes through the entire stream << ... >> and provides
// a getter interface for the key-value table
let dict = parser.getObj();
2021-04-14 01:26:12 +09:00
// The pdflib PDF generator can generate a nested trailer dictionary
if (!(dict instanceof Dict) && dict.dict) {
2021-04-14 01:26:12 +09:00
dict = dict.dict;
}
if (!(dict instanceof Dict)) {
2021-04-14 01:26:12 +09:00
throw new FormatError(
"Invalid XRef table: could not parse trailer dictionary"
);
}
delete this.tableState;
2021-04-14 01:26:12 +09:00
return dict;
}
readXRefTable(parser) {
// Example of cross-reference table:
// xref
// 0 1 <-- subsection header (first obj #, obj count)
// 0000000000 65535 f <-- actual object (offset, generation #, f/n)
// 23 2 <-- subsection header ... and so on ...
// 0000025518 00002 n
// 0000025635 00000 n
// trailer
// ...
const stream = parser.lexer.stream;
const tableState = this.tableState;
2021-04-14 01:26:12 +09:00
stream.pos = tableState.streamPos;
parser.buf1 = tableState.parserBuf1;
parser.buf2 = tableState.parserBuf2;
// Outer loop is over subsection headers
let obj;
2021-04-14 01:26:12 +09:00
while (true) {
if (!("firstEntryNum" in tableState) || !("entryCount" in tableState)) {
if (isCmd((obj = parser.getObj()), "trailer")) {
break;
}
2021-04-14 01:26:12 +09:00
tableState.firstEntryNum = obj;
tableState.entryCount = parser.getObj();
}
let first = tableState.firstEntryNum;
const count = tableState.entryCount;
2021-04-14 01:26:12 +09:00
if (!Number.isInteger(first) || !Number.isInteger(count)) {
throw new FormatError(
2021-04-14 01:26:12 +09:00
"Invalid XRef table: wrong types in subsection header"
);
}
2021-04-14 01:26:12 +09:00
// Inner loop is over objects themselves
for (let i = tableState.entryNum; i < count; i++) {
2021-04-14 01:26:12 +09:00
tableState.streamPos = stream.pos;
tableState.entryNum = i;
tableState.parserBuf1 = parser.buf1;
tableState.parserBuf2 = parser.buf2;
const entry = {};
2021-04-14 01:26:12 +09:00
entry.offset = parser.getObj();
entry.gen = parser.getObj();
const type = parser.getObj();
2021-04-14 01:26:12 +09:00
if (type instanceof Cmd) {
switch (type.cmd) {
case "f":
entry.free = true;
break;
case "n":
entry.uncompressed = true;
break;
}
}
2021-04-14 01:26:12 +09:00
// Validate entry obj
if (
!Number.isInteger(entry.offset) ||
!Number.isInteger(entry.gen) ||
!(entry.free || entry.uncompressed)
) {
throw new FormatError(
2021-04-14 01:26:12 +09:00
`Invalid entry in XRef subsection: ${first}, ${count}`
);
}
2021-04-14 01:26:12 +09:00
// The first xref table entry, i.e. obj 0, should be free. Attempting
// to adjust an incorrect first obj # (fixes issue 3248 and 7229).
if (i === 0 && entry.free && first === 1) {
first = 0;
}
2021-04-14 01:26:12 +09:00
if (!this.entries[i + first]) {
this.entries[i + first] = entry;
}
}
2021-04-14 01:26:12 +09:00
tableState.entryNum = 0;
tableState.streamPos = stream.pos;
tableState.parserBuf1 = parser.buf1;
tableState.parserBuf2 = parser.buf2;
delete tableState.firstEntryNum;
delete tableState.entryCount;
}
2021-04-14 01:26:12 +09:00
// Sanity check: as per spec, first object must be free
if (this.entries[0] && !this.entries[0].free) {
throw new FormatError("Invalid XRef table: unexpected first object");
}
return obj;
}
processXRefStream(stream) {
if (!("streamState" in this)) {
// Stores state of the stream as we process it so we can resume
// from middle of stream in case of missing data error
const streamParameters = stream.dict;
const byteWidths = streamParameters.get("W");
let range = streamParameters.get("Index");
2021-04-14 01:26:12 +09:00
if (!range) {
range = [0, streamParameters.get("Size")];
}
this.streamState = {
entryRanges: range,
byteWidths,
entryNum: 0,
streamPos: stream.pos,
};
}
this.readXRefStream(stream);
delete this.streamState;
return stream.dict;
}
2021-04-14 01:26:12 +09:00
readXRefStream(stream) {
const streamState = this.streamState;
2021-04-14 01:26:12 +09:00
stream.pos = streamState.streamPos;
const [typeFieldWidth, offsetFieldWidth, generationFieldWidth] =
streamState.byteWidths;
const entryRanges = streamState.entryRanges;
2021-04-14 01:26:12 +09:00
while (entryRanges.length > 0) {
const [first, n] = entryRanges;
2021-04-14 01:26:12 +09:00
if (!Number.isInteger(first) || !Number.isInteger(n)) {
throw new FormatError(`Invalid XRef range fields: ${first}, ${n}`);
}
if (
!Number.isInteger(typeFieldWidth) ||
!Number.isInteger(offsetFieldWidth) ||
!Number.isInteger(generationFieldWidth)
) {
throw new FormatError(
`Invalid XRef entry fields length: ${first}, ${n}`
);
}
for (let i = streamState.entryNum; i < n; ++i) {
2021-04-14 01:26:12 +09:00
streamState.entryNum = i;
streamState.streamPos = stream.pos;
let type = 0,
2021-04-14 01:26:12 +09:00
offset = 0,
generation = 0;
for (let j = 0; j < typeFieldWidth; ++j) {
const typeByte = stream.getByte();
if (typeByte === -1) {
throw new FormatError("Invalid XRef byteWidths 'type'.");
}
type = (type << 8) | typeByte;
}
2021-04-14 01:26:12 +09:00
// if type field is absent, its default value is 1
if (typeFieldWidth === 0) {
type = 1;
}
for (let j = 0; j < offsetFieldWidth; ++j) {
const offsetByte = stream.getByte();
if (offsetByte === -1) {
throw new FormatError("Invalid XRef byteWidths 'offset'.");
}
offset = (offset << 8) | offsetByte;
}
for (let j = 0; j < generationFieldWidth; ++j) {
const generationByte = stream.getByte();
if (generationByte === -1) {
throw new FormatError("Invalid XRef byteWidths 'generation'.");
}
generation = (generation << 8) | generationByte;
2021-04-14 01:26:12 +09:00
}
const entry = {};
2021-04-14 01:26:12 +09:00
entry.offset = offset;
entry.gen = generation;
switch (type) {
case 0:
entry.free = true;
break;
2021-04-14 01:26:12 +09:00
case 1:
entry.uncompressed = true;
break;
case 2:
break;
default:
throw new FormatError(`Invalid XRef entry type: ${type}`);
}
2021-04-14 01:26:12 +09:00
if (!this.entries[first + i]) {
this.entries[first + i] = entry;
}
}
2021-04-14 01:26:12 +09:00
streamState.entryNum = 0;
streamState.streamPos = stream.pos;
entryRanges.splice(0, 2);
}
}
indexObjects() {
// Simple scan through the PDF content to find objects,
// trailers and XRef streams.
const TAB = 0x9,
2021-04-14 01:26:12 +09:00
LF = 0xa,
CR = 0xd,
SPACE = 0x20;
const PERCENT = 0x25,
2021-04-14 01:26:12 +09:00
LT = 0x3c;
function readToken(data, offset) {
let token = "",
2021-04-14 01:26:12 +09:00
ch = data[offset];
while (ch !== LF && ch !== CR && ch !== LT) {
if (++offset >= data.length) {
break;
}
2021-04-14 01:26:12 +09:00
token += String.fromCharCode(ch);
ch = data[offset];
}
return token;
}
function skipUntil(data, offset, what) {
const length = what.length,
2021-04-14 01:26:12 +09:00
dataLength = data.length;
let skipped = 0;
2021-04-14 01:26:12 +09:00
// finding byte sequence
while (offset < dataLength) {
let i = 0;
2021-04-14 01:26:12 +09:00
while (i < length && data[offset + i] === what[i]) {
++i;
}
2021-04-14 01:26:12 +09:00
if (i >= length) {
break; // sequence found
}
offset++;
skipped++;
}
return skipped;
}
const objRegExp = /^(\d+)\s+(\d+)\s+obj\b/;
2021-04-14 01:26:12 +09:00
const endobjRegExp = /\bendobj[\b\s]$/;
const nestedObjRegExp = /\s+(\d+\s+\d+\s+obj[\b\s<])$/;
const CHECK_CONTENT_LENGTH = 25;
const trailerBytes = new Uint8Array([116, 114, 97, 105, 108, 101, 114]);
const startxrefBytes = new Uint8Array([
115, 116, 97, 114, 116, 120, 114, 101, 102,
]);
2021-04-14 01:26:12 +09:00
const objBytes = new Uint8Array([111, 98, 106]);
const xrefBytes = new Uint8Array([47, 88, 82, 101, 102]);
2021-04-14 01:26:12 +09:00
// Clear out any existing entries, since they may be bogus.
this.entries.length = 0;
2021-12-03 00:40:31 +09:00
this._cacheMap.clear();
2021-04-14 01:26:12 +09:00
const stream = this.stream;
2021-04-14 01:26:12 +09:00
stream.pos = 0;
const buffer = stream.getBytes(),
2021-04-14 01:26:12 +09:00
length = buffer.length;
let position = stream.start;
const trailers = [],
2021-04-14 01:26:12 +09:00
xrefStms = [];
while (position < length) {
let ch = buffer[position];
2021-04-14 01:26:12 +09:00
if (ch === TAB || ch === LF || ch === CR || ch === SPACE) {
++position;
continue;
}
if (ch === PERCENT) {
// %-comment
do {
++position;
if (position >= length) {
break;
}
2021-04-14 01:26:12 +09:00
ch = buffer[position];
} while (ch !== LF && ch !== CR);
continue;
}
const token = readToken(buffer, position);
let m;
2021-04-14 01:26:12 +09:00
if (
token.startsWith("xref") &&
(token.length === 4 || /\s/.test(token[4]))
) {
position += skipUntil(buffer, position, trailerBytes);
trailers.push(position);
position += skipUntil(buffer, position, startxrefBytes);
} else if ((m = objRegExp.exec(token))) {
const num = m[1] | 0,
gen = m[2] | 0;
let contentLength,
startPos = position + token.length,
updateEntries = false;
if (!this.entries[num]) {
updateEntries = true;
} else if (this.entries[num].gen === gen) {
// Before overwriting an existing entry, ensure that the new one won't
// cause *immediate* errors when it's accessed (fixes issue13783.pdf).
try {
const parser = new Parser({
lexer: new Lexer(stream.makeSubStream(startPos)),
});
parser.getObj();
updateEntries = true;
} catch (ex) {
if (ex instanceof ParserEOFException) {
warn(`indexObjects -- checking object (${token}): "${ex}".`);
} else {
// The error may come from the `Parser`-instance being initialized
// without an `XRef`-instance (we don't have a usable one yet).
updateEntries = true;
}
}
}
if (updateEntries) {
2021-04-14 01:26:12 +09:00
this.entries[num] = {
offset: position - stream.start,
gen,
uncompressed: true,
};
}
2021-04-14 01:26:12 +09:00
// Find the next "obj" string, rather than "endobj", to ensure that
// we won't skip over a new 'obj' operator in corrupt files where
// 'endobj' operators are missing (fixes issue9105_reduced.pdf).
while (startPos < buffer.length) {
const endPos = startPos + skipUntil(buffer, startPos, objBytes) + 4;
contentLength = endPos - position;
2021-04-14 01:26:12 +09:00
const checkPos = Math.max(endPos - CHECK_CONTENT_LENGTH, startPos);
const tokenStr = bytesToString(buffer.subarray(checkPos, endPos));
2021-04-14 01:26:12 +09:00
// Check if the current object ends with an 'endobj' operator.
if (endobjRegExp.test(tokenStr)) {
break;
} else {
// Check if an "obj" occurrence is actually a new object,
// i.e. the current object is missing the 'endobj' operator.
const objToken = nestedObjRegExp.exec(tokenStr);
if (objToken && objToken[1]) {
warn(
'indexObjects: Found new "obj" inside of another "obj", ' +
'caused by missing "endobj" -- trying to recover.'
);
contentLength -= objToken[1].length;
break;
}
}
2021-04-14 01:26:12 +09:00
startPos = endPos;
}
const content = buffer.subarray(position, position + contentLength);
// checking XRef stream suspect
// (it shall have '/XRef' and next char is not a letter)
const xrefTagOffset = skipUntil(content, 0, xrefBytes);
2021-04-14 01:26:12 +09:00
if (xrefTagOffset < contentLength && content[xrefTagOffset + 5] < 64) {
xrefStms.push(position - stream.start);
this.xrefstms[position - stream.start] = 1; // Avoid recursion
}
2021-04-14 01:26:12 +09:00
position += contentLength;
} else if (
token.startsWith("trailer") &&
(token.length === 7 || /\s/.test(token[7]))
) {
trailers.push(position);
position += skipUntil(buffer, position, startxrefBytes);
} else {
position += token.length + 1;
}
}
// reading XRef streams
for (let i = 0, ii = xrefStms.length; i < ii; ++i) {
this.startXRefQueue.push(xrefStms[i]);
this.readXRef(/* recoveryMode */ true);
}
// finding main trailer
let trailerDict;
for (let i = 0, ii = trailers.length; i < ii; ++i) {
stream.pos = trailers[i];
const parser = new Parser({
lexer: new Lexer(stream),
xref: this,
allowStreams: true,
recoveryMode: true,
});
const obj = parser.getObj();
2021-04-14 01:26:12 +09:00
if (!isCmd(obj, "trailer")) {
continue;
}
2021-04-14 01:26:12 +09:00
// read the trailer dictionary
const dict = parser.getObj();
if (!(dict instanceof Dict)) {
2021-04-14 01:26:12 +09:00
continue;
}
2021-04-14 01:26:12 +09:00
// Do some basic validation of the trailer/root dictionary candidate.
try {
const rootDict = dict.get("Root");
if (!(rootDict instanceof Dict)) {
continue;
}
2021-04-14 01:26:12 +09:00
const pagesDict = rootDict.get("Pages");
if (!(pagesDict instanceof Dict)) {
continue;
}
2021-04-14 01:26:12 +09:00
const pagesCount = pagesDict.get("Count");
if (!Number.isInteger(pagesCount)) {
continue;
}
2021-04-14 01:26:12 +09:00
// The top-level /Pages dictionary isn't obviously corrupt.
} catch (ex) {
continue;
}
// taking the first one with 'ID'
if (dict.has("ID")) {
return dict;
}
// The current dictionary is a candidate, but continue searching.
trailerDict = dict;
}
// No trailer with 'ID', taking last one (if exists).
if (trailerDict) {
return trailerDict;
}
// No trailer dictionary found, taking the "top"-dictionary (if exists).
if (this.topDict) {
return this.topDict;
}
2021-04-14 01:26:12 +09:00
// nothing helps
throw new InvalidPDFException("Invalid PDF structure.");
}
2021-04-14 01:26:12 +09:00
readXRef(recoveryMode = false) {
const stream = this.stream;
2021-04-14 01:26:12 +09:00
// Keep track of already parsed XRef tables, to prevent an infinite loop
// when parsing corrupt PDF files where e.g. the /Prev entries create a
// circular dependency between tables (fixes bug1393476.pdf).
const startXRefParsedCache = new Set();
2021-04-14 01:26:12 +09:00
try {
while (this.startXRefQueue.length) {
const startXRef = this.startXRefQueue[0];
2021-04-14 01:26:12 +09:00
if (startXRefParsedCache.has(startXRef)) {
warn("readXRef - skipping XRef table since it was already parsed.");
this.startXRefQueue.shift();
continue;
}
startXRefParsedCache.add(startXRef);
2021-04-14 01:26:12 +09:00
stream.pos = startXRef + stream.start;
const parser = new Parser({
lexer: new Lexer(stream),
xref: this,
allowStreams: true,
});
let obj = parser.getObj();
let dict;
2021-04-14 01:26:12 +09:00
// Get dictionary
if (isCmd(obj, "xref")) {
// Parse end-of-file XRef
dict = this.processXRefTable(parser);
if (!this.topDict) {
this.topDict = dict;
}
2021-04-14 01:26:12 +09:00
// Recursively get other XRefs 'XRefStm', if any
obj = dict.get("XRefStm");
if (Number.isInteger(obj)) {
const pos = obj;
2021-04-14 01:26:12 +09:00
// ignore previously loaded xref streams
// (possible infinite recursion)
if (!(pos in this.xrefstms)) {
this.xrefstms[pos] = 1;
this.startXRefQueue.push(pos);
}
}
2021-04-14 01:26:12 +09:00
} else if (Number.isInteger(obj)) {
// Parse in-stream XRef
if (
!Number.isInteger(parser.getObj()) ||
!isCmd(parser.getObj(), "obj") ||
!((obj = parser.getObj()) instanceof BaseStream)
2021-04-14 01:26:12 +09:00
) {
throw new FormatError("Invalid XRef stream");
}
dict = this.processXRefStream(obj);
if (!this.topDict) {
this.topDict = dict;
}
if (!dict) {
throw new FormatError("Failed to read XRef stream");
}
} else {
throw new FormatError("Invalid XRef stream header");
}
2021-04-14 01:26:12 +09:00
// Recursively get previous dictionary, if any
obj = dict.get("Prev");
if (Number.isInteger(obj)) {
this.startXRefQueue.push(obj);
} else if (obj instanceof Ref) {
2021-04-14 01:26:12 +09:00
// The spec says Prev must not be a reference, i.e. "/Prev NNN"
// This is a fallback for non-compliant PDFs, i.e. "/Prev NNN 0 R"
this.startXRefQueue.push(obj.num);
}
2021-04-14 01:26:12 +09:00
this.startXRefQueue.shift();
}
2021-04-14 01:26:12 +09:00
return this.topDict;
} catch (e) {
if (e instanceof MissingDataException) {
throw e;
}
2021-04-14 01:26:12 +09:00
info("(while reading XRef): " + e);
this.startXRefQueue.shift();
2021-04-14 01:26:12 +09:00
}
2021-04-14 01:26:12 +09:00
if (recoveryMode) {
return undefined;
}
throw new XRefParseException();
}
2021-04-14 01:26:12 +09:00
getEntry(i) {
const xrefEntry = this.entries[i];
2021-04-14 01:26:12 +09:00
if (xrefEntry && !xrefEntry.free && xrefEntry.offset) {
return xrefEntry;
}
return null;
}
fetchIfRef(obj, suppressEncryption = false) {
if (obj instanceof Ref) {
return this.fetch(obj, suppressEncryption);
}
return obj;
}
fetch(ref, suppressEncryption = false) {
if (!(ref instanceof Ref)) {
throw new Error("ref object is not a reference");
}
const num = ref.num;
// The XRef cache is populated with objects which are obtained through
// `Parser.getObj`, and indirectly via `Lexer.getObj`. Neither of these
// methods should ever return `undefined` (note the `assert` calls below).
const cacheEntry = this._cacheMap.get(num);
if (cacheEntry !== undefined) {
// In documents with Object Streams, it's possible that cached `Dict`s
// have not been assigned an `objId` yet (see e.g. issue3115r.pdf).
if (cacheEntry instanceof Dict && !cacheEntry.objId) {
cacheEntry.objId = ref.toString();
}
return cacheEntry;
}
let xrefEntry = this.getEntry(num);
if (xrefEntry === null) {
// The referenced entry can be free.
this._cacheMap.set(num, xrefEntry);
return xrefEntry;
}
Prevent circular references in XRef tables from hanging the worker-thread (issue 14303) *Please note:* While this patch on its own is sufficient to prevent the worker-thread from hanging, however in combination with PR 14311 these PDF documents will both load *and* render correctly. Rather than focusing on the particular structure of these PDF documents, it seemed (at least to me) to make sense to try and prevent all circular references when fetching/looking-up data using the XRef table. To avoid a solution that required tracking the references manually everywhere, the implementation settled on here instead handles that internally in the `XRef.fetch`-method. This should work, since that method *and* the `Parser`/`Lexer`-implementations are completely synchronous. Note also that the existing `XRef`-caching, used for all data-types *except* Streams, should hopefully help to lessen the performance impact of these changes. One *potential* problem with these changes could be certain *browser* exceptions, since those are generally not catchable in JavaScript code, however those would most likely "stop" worker-thread parsing anyway (at least I hope so). Finally, note that I settled on returning dummy-data rather than throwing an exception. This was done to allow parsing, for the rest of the document, to continue such that *one* bad reference doesn't prevent an entire document from loading. Fixes two of the issues listed in issue 14303, namely the `poppler-91414-0.zip-2.gz-53.pdf` and `poppler-91414-0.zip-2.gz-54.pdf` documents.
2021-11-26 22:11:39 +09:00
// Prevent circular references, in corrupt PDF documents, from hanging the
// worker-thread. This relies, implicitly, on the parsing being synchronous.
if (this._pendingRefs.has(ref)) {
this._pendingRefs.remove(ref);
2021-04-14 01:26:12 +09:00
Prevent circular references in XRef tables from hanging the worker-thread (issue 14303) *Please note:* While this patch on its own is sufficient to prevent the worker-thread from hanging, however in combination with PR 14311 these PDF documents will both load *and* render correctly. Rather than focusing on the particular structure of these PDF documents, it seemed (at least to me) to make sense to try and prevent all circular references when fetching/looking-up data using the XRef table. To avoid a solution that required tracking the references manually everywhere, the implementation settled on here instead handles that internally in the `XRef.fetch`-method. This should work, since that method *and* the `Parser`/`Lexer`-implementations are completely synchronous. Note also that the existing `XRef`-caching, used for all data-types *except* Streams, should hopefully help to lessen the performance impact of these changes. One *potential* problem with these changes could be certain *browser* exceptions, since those are generally not catchable in JavaScript code, however those would most likely "stop" worker-thread parsing anyway (at least I hope so). Finally, note that I settled on returning dummy-data rather than throwing an exception. This was done to allow parsing, for the rest of the document, to continue such that *one* bad reference doesn't prevent an entire document from loading. Fixes two of the issues listed in issue 14303, namely the `poppler-91414-0.zip-2.gz-53.pdf` and `poppler-91414-0.zip-2.gz-54.pdf` documents.
2021-11-26 22:11:39 +09:00
warn(`Ignoring circular reference: ${ref}.`);
return CIRCULAR_REF;
}
this._pendingRefs.put(ref);
try {
if (xrefEntry.uncompressed) {
xrefEntry = this.fetchUncompressed(ref, xrefEntry, suppressEncryption);
} else {
xrefEntry = this.fetchCompressed(ref, xrefEntry, suppressEncryption);
}
this._pendingRefs.remove(ref);
} catch (ex) {
this._pendingRefs.remove(ref);
throw ex;
2021-04-14 01:26:12 +09:00
}
if (xrefEntry instanceof Dict) {
2021-04-14 01:26:12 +09:00
xrefEntry.objId = ref.toString();
} else if (xrefEntry instanceof BaseStream) {
2021-04-14 01:26:12 +09:00
xrefEntry.dict.objId = ref.toString();
}
return xrefEntry;
}
fetchUncompressed(ref, xrefEntry, suppressEncryption = false) {
const gen = ref.gen;
let num = ref.num;
2021-04-14 01:26:12 +09:00
if (xrefEntry.gen !== gen) {
throw new XRefEntryException(`Inconsistent generation in XRef: ${ref}`);
}
const stream = this.stream.makeSubStream(
2021-04-14 01:26:12 +09:00
xrefEntry.offset + this.stream.start
);
const parser = new Parser({
lexer: new Lexer(stream),
xref: this,
allowStreams: true,
});
const obj1 = parser.getObj();
const obj2 = parser.getObj();
const obj3 = parser.getObj();
2021-04-14 01:26:12 +09:00
if (obj1 !== num || obj2 !== gen || !(obj3 instanceof Cmd)) {
throw new XRefEntryException(`Bad (uncompressed) XRef entry: ${ref}`);
}
if (obj3.cmd !== "obj") {
// some bad PDFs use "obj1234" and really mean 1234
if (obj3.cmd.startsWith("obj")) {
num = parseInt(obj3.cmd.substring(3), 10);
if (!Number.isNaN(num)) {
return num;
}
}
2021-04-14 01:26:12 +09:00
throw new XRefEntryException(`Bad (uncompressed) XRef entry: ${ref}`);
}
if (this.encrypt && !suppressEncryption) {
xrefEntry = parser.getObj(this.encrypt.createCipherTransform(num, gen));
} else {
xrefEntry = parser.getObj();
}
if (!(xrefEntry instanceof BaseStream)) {
2021-04-14 01:26:12 +09:00
if (
typeof PDFJSDev === "undefined" ||
PDFJSDev.test("!PRODUCTION || TESTING")
) {
assert(
xrefEntry !== undefined,
'fetchUncompressed: The "xrefEntry" cannot be undefined.'
);
}
2021-04-14 01:26:12 +09:00
this._cacheMap.set(num, xrefEntry);
}
return xrefEntry;
}
fetchCompressed(ref, xrefEntry, suppressEncryption = false) {
const tableOffset = xrefEntry.offset;
const stream = this.fetch(Ref.get(tableOffset, 0));
if (!(stream instanceof BaseStream)) {
2021-04-14 01:26:12 +09:00
throw new FormatError("bad ObjStm stream");
}
const first = stream.dict.get("First");
const n = stream.dict.get("N");
if (!Number.isInteger(first) || !Number.isInteger(n)) {
throw new FormatError("invalid first and n parameters for ObjStm stream");
}
let parser = new Parser({
lexer: new Lexer(stream),
xref: this,
allowStreams: true,
});
const nums = new Array(n);
const offsets = new Array(n);
// read the object numbers to populate cache
for (let i = 0; i < n; ++i) {
const num = parser.getObj();
if (!Number.isInteger(num)) {
throw new FormatError(
`invalid object number in the ObjStm stream: ${num}`
);
}
2021-04-14 01:26:12 +09:00
const offset = parser.getObj();
if (!Number.isInteger(offset)) {
throw new FormatError(
`invalid object offset in the ObjStm stream: ${offset}`
);
}
nums[i] = num;
offsets[i] = offset;
}
const start = (stream.start || 0) + first;
const entries = new Array(n);
// read stream objects for cache
for (let i = 0; i < n; ++i) {
const length = i < n - 1 ? offsets[i + 1] - offsets[i] : undefined;
if (length < 0) {
throw new FormatError("Invalid offset in the ObjStm stream.");
}
parser = new Parser({
lexer: new Lexer(
stream.makeSubStream(start + offsets[i], length, stream.dict)
),
xref: this,
allowStreams: true,
});
2021-04-14 01:26:12 +09:00
const obj = parser.getObj();
entries[i] = obj;
if (obj instanceof BaseStream) {
2021-04-14 01:26:12 +09:00
continue;
}
2021-04-14 01:26:12 +09:00
const num = nums[i],
entry = this.entries[num];
if (entry && entry.offset === tableOffset && entry.gen === i) {
if (
typeof PDFJSDev === "undefined" ||
PDFJSDev.test("!PRODUCTION || TESTING")
) {
assert(
2021-04-14 01:26:12 +09:00
obj !== undefined,
'fetchCompressed: The "obj" cannot be undefined.'
);
}
2021-04-14 01:26:12 +09:00
this._cacheMap.set(num, obj);
}
}
xrefEntry = entries[xrefEntry.gen];
if (xrefEntry === undefined) {
throw new XRefEntryException(`Bad (compressed) XRef entry: ${ref}`);
}
return xrefEntry;
}
async fetchIfRefAsync(obj, suppressEncryption) {
if (obj instanceof Ref) {
return this.fetchAsync(obj, suppressEncryption);
}
return obj;
}
async fetchAsync(ref, suppressEncryption) {
try {
return this.fetch(ref, suppressEncryption);
} catch (ex) {
if (!(ex instanceof MissingDataException)) {
throw ex;
}
2021-04-14 01:26:12 +09:00
await this.pdfManager.requestRange(ex.begin, ex.end);
return this.fetchAsync(ref, suppressEncryption);
}
}
  /**
   * @returns {*} The document catalog object (`this.root`); set elsewhere
   *   during parsing — presumably from the trailer's /Root entry, verify
   *   against the `parse`-method.
   */
  getCatalogObj() {
    return this.root;
  }
}
export { XRef };