pdf.js/src/core/primitives.js

441 lines
11 KiB
JavaScript
Raw Normal View History

/* Copyright 2012 Mozilla Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import { assert, shadow, unreachable } from "../shared/util.js";
Prevent circular references in XRef tables from hanging the worker-thread (issue 14303) *Please note:* While this patch on its own is sufficient to prevent the worker-thread from hanging, however in combination with PR 14311 these PDF documents will both load *and* render correctly. Rather than focusing on the particular structure of these PDF documents, it seemed (at least to me) to make sense to try and prevent all circular references when fetching/looking-up data using the XRef table. To avoid a solution that required tracking the references manually everywhere, the implementation settled on here instead handles that internally in the `XRef.fetch`-method. This should work, since that method *and* the `Parser`/`Lexer`-implementations are completely synchronous. Note also that the existing `XRef`-caching, used for all data-types *except* Streams, should hopefully help to lessen the performance impact of these changes. One *potential* problem with these changes could be certain *browser* exceptions, since those are generally not catchable in JavaScript code, however those would most likely "stop" worker-thread parsing anyway (at least I hope so). Finally, note that I settled on returning dummy-data rather than throwing an exception. This was done to allow parsing, for the rest of the document, to continue such that *one* bad reference doesn't prevent an entire document from loading. Fixes two of the issues listed in issue 14303, namely the `poppler-91414-0.zip-2.gz-53.pdf` and `poppler-91414-0.zip-2.gz-54.pdf` documents.
2021-11-26 22:11:39 +09:00
const CIRCULAR_REF = Symbol("CIRCULAR_REF");
const EOF = Symbol("EOF");
const Name = (function NameClosure() {
let nameCache = Object.create(null);
// eslint-disable-next-line no-shadow
class Name {
constructor(name) {
if (
(typeof PDFJSDev === "undefined" ||
PDFJSDev.test("!PRODUCTION || TESTING")) &&
typeof name !== "string"
) {
unreachable('Name: The "name" must be a string.');
}
this.name = name;
}
static get(name) {
// eslint-disable-next-line no-restricted-syntax
return nameCache[name] || (nameCache[name] = new Name(name));
}
static _clearCache() {
nameCache = Object.create(null);
}
}
return Name;
})();
const Cmd = (function CmdClosure() {
let cmdCache = Object.create(null);
// eslint-disable-next-line no-shadow
class Cmd {
constructor(cmd) {
if (
(typeof PDFJSDev === "undefined" ||
PDFJSDev.test("!PRODUCTION || TESTING")) &&
typeof cmd !== "string"
) {
unreachable('Cmd: The "cmd" must be a string.');
}
this.cmd = cmd;
}
static get(cmd) {
// eslint-disable-next-line no-restricted-syntax
return cmdCache[cmd] || (cmdCache[cmd] = new Cmd(cmd));
}
static _clearCache() {
cmdCache = Object.create(null);
}
}
return Cmd;
})();
const nonSerializable = function nonSerializableClosure() {
return nonSerializable; // Creating closure on some variable.
};
class Dict {
constructor(xref = null) {
// Map should only be used internally, use functions below to access.
this._map = Object.create(null);
this.xref = xref;
this.objId = null;
this.suppressEncryption = false;
this.__nonSerializable__ = nonSerializable; // Disable cloning of the Dict.
}
assignXref(newXref) {
this.xref = newXref;
}
get size() {
return Object.keys(this._map).length;
}
// Automatically dereferences Ref objects.
get(key1, key2, key3) {
let value = this._map[key1];
if (value === undefined && key2 !== undefined) {
if (
(typeof PDFJSDev === "undefined" ||
PDFJSDev.test("!PRODUCTION || TESTING")) &&
key2.length < key1.length
) {
unreachable("Dict.get: Expected keys to be ordered by length.");
}
value = this._map[key2];
if (value === undefined && key3 !== undefined) {
if (
(typeof PDFJSDev === "undefined" ||
PDFJSDev.test("!PRODUCTION || TESTING")) &&
key3.length < key2.length
) {
unreachable("Dict.get: Expected keys to be ordered by length.");
}
value = this._map[key3];
}
}
if (value instanceof Ref && this.xref) {
return this.xref.fetch(value, this.suppressEncryption);
}
return value;
}
// Same as get(), but returns a promise and uses fetchIfRefAsync().
async getAsync(key1, key2, key3) {
let value = this._map[key1];
if (value === undefined && key2 !== undefined) {
if (
(typeof PDFJSDev === "undefined" ||
PDFJSDev.test("!PRODUCTION || TESTING")) &&
key2.length < key1.length
) {
unreachable("Dict.getAsync: Expected keys to be ordered by length.");
}
value = this._map[key2];
if (value === undefined && key3 !== undefined) {
if (
(typeof PDFJSDev === "undefined" ||
PDFJSDev.test("!PRODUCTION || TESTING")) &&
key3.length < key2.length
) {
unreachable("Dict.getAsync: Expected keys to be ordered by length.");
}
value = this._map[key3];
}
}
if (value instanceof Ref && this.xref) {
return this.xref.fetchAsync(value, this.suppressEncryption);
}
return value;
}
// Same as get(), but dereferences all elements if the result is an Array.
getArray(key1, key2, key3) {
let value = this._map[key1];
if (value === undefined && key2 !== undefined) {
if (
(typeof PDFJSDev === "undefined" ||
PDFJSDev.test("!PRODUCTION || TESTING")) &&
key2.length < key1.length
) {
unreachable("Dict.getArray: Expected keys to be ordered by length.");
}
value = this._map[key2];
if (value === undefined && key3 !== undefined) {
if (
(typeof PDFJSDev === "undefined" ||
PDFJSDev.test("!PRODUCTION || TESTING")) &&
key3.length < key2.length
) {
unreachable("Dict.getArray: Expected keys to be ordered by length.");
}
value = this._map[key3];
}
}
if (value instanceof Ref && this.xref) {
value = this.xref.fetch(value, this.suppressEncryption);
}
if (Array.isArray(value)) {
value = value.slice(); // Ensure that we don't modify the Dict data.
for (let i = 0, ii = value.length; i < ii; i++) {
if (value[i] instanceof Ref && this.xref) {
value[i] = this.xref.fetch(value[i], this.suppressEncryption);
}
Slightly simplify the lookup of data in `Dict.{get, getAsync, has}` Note that `Dict.set` will only be called with values returned through `Parser.getObj`, and thus indirectly via `Lexer.getObj`. Since neither of those methods will ever return `undefined`, we can simply assert that that's the case when inserting data into the `Dict` and thus get rid of `in` checks when doing the data lookups. In this case, since `Dict.set` is fairly hot, the patch utilizes an *inline check* and when necessary a direct call to `unreachable` to not affect performance of `gulp server/test` too much (rather than always just calling `assert`). For very large and complex PDF files this will help performance *slightly*, since `Dict.{get, getAsync, has}` is called *a lot* during parsing in the worker. This patch was tested using the PDF file from issue 2618, i.e. http://bugzilla-attachments.gnome.org/attachment.cgi?id=226471, with the following manifest file: ``` [ { "id": "issue2618", "file": "../web/pdfs/issue2618.pdf", "md5": "", "rounds": 250, "type": "eq" } ] ``` which gave the following results when comparing this patch against the `master` branch: ``` -- Grouped By browser, stat -- browser | stat | Count | Baseline(ms) | Current(ms) | +/- | % | Result(P<.05) ------- | ------------ | ----- | ------------ | ----------- | --- | ----- | ------------- Firefox | Overall | 250 | 2838 | 2820 | -18 | -0.65 | faster Firefox | Page Request | 250 | 1 | 2 | 0 | 11.92 | slower Firefox | Rendering | 250 | 2837 | 2818 | -19 | -0.65 | faster ```
2020-01-31 19:39:48 +09:00
}
}
return value;
}
// No dereferencing.
getRaw(key) {
return this._map[key];
}
getKeys() {
return Object.keys(this._map);
}
// No dereferencing.
getRawValues() {
return Object.values(this._map);
}
set(key, value) {
if (
typeof PDFJSDev === "undefined" ||
PDFJSDev.test("!PRODUCTION || TESTING")
) {
if (typeof key !== "string") {
unreachable('Dict.set: The "key" must be a string.');
} else if (value === undefined) {
unreachable('Dict.set: The "value" cannot be undefined.');
}
}
this._map[key] = value;
}
has(key) {
return this._map[key] !== undefined;
}
forEach(callback) {
for (const key in this._map) {
callback(key, this.get(key));
}
}
static get empty() {
const emptyDict = new Dict(null);
emptyDict.set = (key, value) => {
unreachable("Should not call `set` on the empty dictionary.");
};
return shadow(this, "empty", emptyDict);
}
static merge({ xref, dictArray, mergeSubDicts = false }) {
const mergedDict = new Dict(xref),
properties = new Map();
for (const dict of dictArray) {
if (!(dict instanceof Dict)) {
continue;
}
for (const [key, value] of Object.entries(dict._map)) {
let property = properties.get(key);
if (property === undefined) {
property = [];
properties.set(key, property);
} else if (!mergeSubDicts || !(value instanceof Dict)) {
// Ignore additional entries, if either:
// - This is a "shallow" merge, where only the first element matters.
// - The value is *not* a `Dict`, since other types cannot be merged.
continue;
}
property.push(value);
}
}
for (const [name, values] of properties) {
if (values.length === 1 || !(values[0] instanceof Dict)) {
mergedDict._map[name] = values[0];
continue;
}
const subDict = new Dict(xref);
for (const dict of values) {
for (const [key, value] of Object.entries(dict._map)) {
if (subDict._map[key] === undefined) {
subDict._map[key] = value;
}
}
}
if (subDict.size > 0) {
mergedDict._map[name] = subDict;
}
}
properties.clear();
return mergedDict.size > 0 ? mergedDict : Dict.empty;
}
}
const Ref = (function RefClosure() {
let refCache = Object.create(null);
// eslint-disable-next-line no-shadow
class Ref {
constructor(num, gen) {
this.num = num;
this.gen = gen;
}
toString() {
// This function is hot, so we make the string as compact as possible.
// |this.gen| is almost always zero, so we treat that case specially.
if (this.gen === 0) {
return `${this.num}R`;
}
return `${this.num}R${this.gen}`;
}
static get(num, gen) {
const key = gen === 0 ? `${num}R` : `${num}R${gen}`;
// eslint-disable-next-line no-restricted-syntax
return refCache[key] || (refCache[key] = new Ref(num, gen));
}
static _clearCache() {
refCache = Object.create(null);
}
}
return Ref;
})();
// The reference is identified by number and generation.
// This structure stores only one instance of the reference.
class RefSet {
Add global caching, for /Resources without blend modes, and use it to reduce repeated fetching/parsing in `PartialEvaluator.hasBlendModes` The `PartialEvaluator.hasBlendModes` method is necessary to determine if there's any blend modes on a page, which unfortunately requires *synchronous* parsing of the /Resources of each page before its rendering can start (see the "StartRenderPage"-message). In practice it's not uncommon for certain /Resources-entries to be found on more than one page (referenced via the XRef-table), which thus leads to unnecessary re-fetching/re-parsing of data in `PartialEvaluator.hasBlendModes`. To improve performance, especially in pathological cases, we can cache /Resources-entries when it's absolutely clear that they do not contain *any* blend modes at all[1]. This way, subsequent `PartialEvaluator.hasBlendModes` calls can be made significantly more efficient. This patch was tested using the PDF file from issue 6961, i.e. https://github.com/mozilla/pdf.js/files/121712/test.pdf: ``` [ { "id": "issue6961", "file": "../web/pdfs/issue6961.pdf", "md5": "a80e4357a8fda758d96c2c76f2980b03", "rounds": 100, "type": "eq" } ] ``` which gave the following results when comparing this patch against the `master` branch: ``` -- Grouped By browser, page, stat -- browser | page | stat | Count | Baseline(ms) | Current(ms) | +/- | % | Result(P<.05) ------- | ---- | ------------ | ----- | ------------ | ----------- | ---- | ------ | ------------- firefox | 0 | Overall | 100 | 1034 | 555 | -480 | -46.39 | faster firefox | 0 | Page Request | 100 | 489 | 7 | -482 | -98.67 | faster firefox | 0 | Rendering | 100 | 545 | 548 | 2 | 0.45 | firefox | 1 | Overall | 100 | 912 | 428 | -484 | -53.06 | faster firefox | 1 | Page Request | 100 | 487 | 1 | -486 | -99.77 | faster firefox | 1 | Rendering | 100 | 425 | 427 | 2 | 0.51 | ``` --- [1] In the case where blend modes *are* found, it becomes a lot more difficult to know if it's generally safe to skip /Resources-entries. Hence we don't cache anything in that case, however note that most document/pages do not utilize blend modes anyway.
2020-11-05 21:35:33 +09:00
constructor(parent = null) {
if (
(typeof PDFJSDev === "undefined" ||
PDFJSDev.test("!PRODUCTION || TESTING")) &&
parent &&
!(parent instanceof RefSet)
) {
unreachable('RefSet: Invalid "parent" value.');
}
this._set = new Set(parent && parent._set);
}
has(ref) {
return this._set.has(ref.toString());
}
put(ref) {
this._set.add(ref.toString());
}
remove(ref) {
this._set.delete(ref.toString());
}
Add global caching, for /Resources without blend modes, and use it to reduce repeated fetching/parsing in `PartialEvaluator.hasBlendModes` The `PartialEvaluator.hasBlendModes` method is necessary to determine if there's any blend modes on a page, which unfortunately requires *synchronous* parsing of the /Resources of each page before its rendering can start (see the "StartRenderPage"-message). In practice it's not uncommon for certain /Resources-entries to be found on more than one page (referenced via the XRef-table), which thus leads to unnecessary re-fetching/re-parsing of data in `PartialEvaluator.hasBlendModes`. To improve performance, especially in pathological cases, we can cache /Resources-entries when it's absolutely clear that they do not contain *any* blend modes at all[1]. This way, subsequent `PartialEvaluator.hasBlendModes` calls can be made significantly more efficient. This patch was tested using the PDF file from issue 6961, i.e. https://github.com/mozilla/pdf.js/files/121712/test.pdf: ``` [ { "id": "issue6961", "file": "../web/pdfs/issue6961.pdf", "md5": "a80e4357a8fda758d96c2c76f2980b03", "rounds": 100, "type": "eq" } ] ``` which gave the following results when comparing this patch against the `master` branch: ``` -- Grouped By browser, page, stat -- browser | page | stat | Count | Baseline(ms) | Current(ms) | +/- | % | Result(P<.05) ------- | ---- | ------------ | ----- | ------------ | ----------- | ---- | ------ | ------------- firefox | 0 | Overall | 100 | 1034 | 555 | -480 | -46.39 | faster firefox | 0 | Page Request | 100 | 489 | 7 | -482 | -98.67 | faster firefox | 0 | Rendering | 100 | 545 | 548 | 2 | 0.45 | firefox | 1 | Overall | 100 | 912 | 428 | -484 | -53.06 | faster firefox | 1 | Page Request | 100 | 487 | 1 | -486 | -99.77 | faster firefox | 1 | Rendering | 100 | 425 | 427 | 2 | 0.51 | ``` --- [1] In the case where blend modes *are* found, it becomes a lot more difficult to know if it's generally safe to skip /Resources-entries. Hence we don't cache anything in that case, however note that most document/pages do not utilize blend modes anyway.
2020-11-05 21:35:33 +09:00
[Symbol.iterator]() {
return this._set.values();
Add global caching, for /Resources without blend modes, and use it to reduce repeated fetching/parsing in `PartialEvaluator.hasBlendModes` The `PartialEvaluator.hasBlendModes` method is necessary to determine if there's any blend modes on a page, which unfortunately requires *synchronous* parsing of the /Resources of each page before its rendering can start (see the "StartRenderPage"-message). In practice it's not uncommon for certain /Resources-entries to be found on more than one page (referenced via the XRef-table), which thus leads to unnecessary re-fetching/re-parsing of data in `PartialEvaluator.hasBlendModes`. To improve performance, especially in pathological cases, we can cache /Resources-entries when it's absolutely clear that they do not contain *any* blend modes at all[1]. This way, subsequent `PartialEvaluator.hasBlendModes` calls can be made significantly more efficient. This patch was tested using the PDF file from issue 6961, i.e. https://github.com/mozilla/pdf.js/files/121712/test.pdf: ``` [ { "id": "issue6961", "file": "../web/pdfs/issue6961.pdf", "md5": "a80e4357a8fda758d96c2c76f2980b03", "rounds": 100, "type": "eq" } ] ``` which gave the following results when comparing this patch against the `master` branch: ``` -- Grouped By browser, page, stat -- browser | page | stat | Count | Baseline(ms) | Current(ms) | +/- | % | Result(P<.05) ------- | ---- | ------------ | ----- | ------------ | ----------- | ---- | ------ | ------------- firefox | 0 | Overall | 100 | 1034 | 555 | -480 | -46.39 | faster firefox | 0 | Page Request | 100 | 489 | 7 | -482 | -98.67 | faster firefox | 0 | Rendering | 100 | 545 | 548 | 2 | 0.45 | firefox | 1 | Overall | 100 | 912 | 428 | -484 | -53.06 | faster firefox | 1 | Page Request | 100 | 487 | 1 | -486 | -99.77 | faster firefox | 1 | Rendering | 100 | 425 | 427 | 2 | 0.51 | ``` --- [1] In the case where blend modes *are* found, it becomes a lot more difficult to know if it's generally safe to skip /Resources-entries. Hence we don't cache anything in that case, however note that most document/pages do not utilize blend modes anyway.
2020-11-05 21:35:33 +09:00
}
clear() {
this._set.clear();
}
}
class RefSetCache {
constructor() {
this._map = new Map();
}
get size() {
return this._map.size;
}
get(ref) {
return this._map.get(ref.toString());
}
has(ref) {
return this._map.has(ref.toString());
}
put(ref, obj) {
this._map.set(ref.toString(), obj);
}
putAlias(ref, aliasRef) {
this._map.set(ref.toString(), this.get(aliasRef));
}
[Symbol.iterator]() {
return this._map.values();
}
clear() {
this._map.clear();
}
}
function isName(v, name) {
return v instanceof Name && (name === undefined || v.name === name);
}
function isCmd(v, cmd) {
return v instanceof Cmd && (cmd === undefined || v.cmd === cmd);
}
function isDict(v, type) {
Enable auto-formatting of the entire code-base using Prettier (issue 11444) Note that Prettier, purposely, has only limited [configuration options](https://prettier.io/docs/en/options.html). The configuration file is based on [the one in `mozilla central`](https://searchfox.org/mozilla-central/source/.prettierrc) with just a few additions (to avoid future breakage if the defaults ever changes). Prettier is being used for a couple of reasons: - To be consistent with `mozilla-central`, where Prettier is already in use across the tree. - To ensure a *consistent* coding style everywhere, which is automatically enforced during linting (since Prettier is used as an ESLint plugin). This thus ends "all" formatting disussions once and for all, removing the need for review comments on most stylistic matters. Many ESLint options are now redundant, and I've tried my best to remove all the now unnecessary options (but I may have missed some). Note also that since Prettier considers the `printWidth` option as a guide, rather than a hard rule, this patch resorts to a small hack in the ESLint config to ensure that *comments* won't become too long. *Please note:* This patch is generated automatically, by appending the `--fix` argument to the ESLint call used in the `gulp lint` task. It will thus require some additional clean-up, which will be done in a *separate* commit. (On a more personal note, I'll readily admit that some of the changes Prettier makes are *extremely* ugly. However, in the name of consistency we'll probably have to live with that.)
2019-12-25 23:59:37 +09:00
return (
v instanceof Dict && (type === undefined || isName(v.get("Type"), type))
);
}
function isRefsEqual(v1, v2) {
Enable auto-formatting of the entire code-base using Prettier (issue 11444) Note that Prettier, purposely, has only limited [configuration options](https://prettier.io/docs/en/options.html). The configuration file is based on [the one in `mozilla central`](https://searchfox.org/mozilla-central/source/.prettierrc) with just a few additions (to avoid future breakage if the defaults ever changes). Prettier is being used for a couple of reasons: - To be consistent with `mozilla-central`, where Prettier is already in use across the tree. - To ensure a *consistent* coding style everywhere, which is automatically enforced during linting (since Prettier is used as an ESLint plugin). This thus ends "all" formatting disussions once and for all, removing the need for review comments on most stylistic matters. Many ESLint options are now redundant, and I've tried my best to remove all the now unnecessary options (but I may have missed some). Note also that since Prettier considers the `printWidth` option as a guide, rather than a hard rule, this patch resorts to a small hack in the ESLint config to ensure that *comments* won't become too long. *Please note:* This patch is generated automatically, by appending the `--fix` argument to the ESLint call used in the `gulp lint` task. It will thus require some additional clean-up, which will be done in a *separate* commit. (On a more personal note, I'll readily admit that some of the changes Prettier makes are *extremely* ugly. However, in the name of consistency we'll probably have to live with that.)
2019-12-25 23:59:37 +09:00
if (
typeof PDFJSDev === "undefined" ||
PDFJSDev.test("!PRODUCTION || TESTING")
) {
assert(
v1 instanceof Ref && v2 instanceof Ref,
"isRefsEqual: Both parameters should be `Ref`s."
);
}
return v1.num === v2.num && v1.gen === v2.gen;
}
function clearPrimitiveCaches() {
Cmd._clearCache();
Name._clearCache();
Ref._clearCache();
}
export {
Prevent circular references in XRef tables from hanging the worker-thread (issue 14303) *Please note:* While this patch on its own is sufficient to prevent the worker-thread from hanging, however in combination with PR 14311 these PDF documents will both load *and* render correctly. Rather than focusing on the particular structure of these PDF documents, it seemed (at least to me) to make sense to try and prevent all circular references when fetching/looking-up data using the XRef table. To avoid a solution that required tracking the references manually everywhere, the implementation settled on here instead handles that internally in the `XRef.fetch`-method. This should work, since that method *and* the `Parser`/`Lexer`-implementations are completely synchronous. Note also that the existing `XRef`-caching, used for all data-types *except* Streams, should hopefully help to lessen the performance impact of these changes. One *potential* problem with these changes could be certain *browser* exceptions, since those are generally not catchable in JavaScript code, however those would most likely "stop" worker-thread parsing anyway (at least I hope so). Finally, note that I settled on returning dummy-data rather than throwing an exception. This was done to allow parsing, for the rest of the document, to continue such that *one* bad reference doesn't prevent an entire document from loading. Fixes two of the issues listed in issue 14303, namely the `poppler-91414-0.zip-2.gz-53.pdf` and `poppler-91414-0.zip-2.gz-54.pdf` documents.
2021-11-26 22:11:39 +09:00
CIRCULAR_REF,
clearPrimitiveCaches,
Cmd,
Dict,
EOF,
isCmd,
isDict,
isName,
isRefsEqual,
Name,
Ref,
RefSet,
RefSetCache,
};