From d62c9181bd2534278bface643c363d245574af07 Mon Sep 17 00:00:00 2001 From: Jonas Jenwald Date: Sat, 23 May 2020 13:55:31 +0200 Subject: [PATCH 1/2] Improve the *local* image caching in `PartialEvaluator.getOperatorList` Currently the local `imageCache`, as used in `PartialEvaluator.getOperatorList`, will miss certain cases of repeated images because the caching is *only* done by name (usually using a format such as e.g. "Im0", "Im1", ...). However, in some PDF documents the `/XObject` dictionaries many contain hundreds (or even thousands) of distinctly named images, despite them referring to only a handful of actual image objects (via the XRef table). With these changes we'll now cache *local* images using both name and (where applicable) reference, thus improving re-usage of images resources even further. This patch was tested using the PDF file from [bug 857031](https://bugzilla.mozilla.org/show_bug.cgi?id=857031), i.e. https://bug857031.bmoattachments.org/attachment.cgi?id=732270, with the following manifest file: ``` [ { "id": "bug857031", "file": "../web/pdfs/bug857031.pdf", "md5": "", "rounds": 250, "lastPage": 1, "type": "eq" } ] ``` which gave the following results when comparing this patch against the `master` branch: ``` -- Grouped By browser, page, stat -- browser | page | stat | Count | Baseline(ms) | Current(ms) | +/- | % | Result(P<.05) ------- | ---- | ------------ | ----- | ------------ | ----------- | --- | ----- | ------------- firefox | 0 | Overall | 250 | 2749 | 2656 | -93 | -3.38 | faster firefox | 0 | Page Request | 250 | 3 | 4 | 1 | 50.14 | slower firefox | 0 | Rendering | 250 | 2746 | 2652 | -94 | -3.44 | faster ``` While this is certainly an improvement, since we now avoid re-parsing ~1000 images on the first page, all of the image resources are small enough that the total rendering time doesn't improve that much in this particular case. In pathological cases, such as e.g. the PDF document in issue 4958, the improvements with this patch can be very significant. Looking for example at page 2, from issue 4958, the rendering time drops from ~60 seconds with `master` to ~30 seconds with this patch (obviously still slow, but it really showcases the potential of this patch nicely). Finally, note that there's also potential for additional improvements by re-using `LocalImageCache` instances for e.g. /XObject data of the `Form`-type. However, given that recent changes in this area I purposely didn't want to complicate *this* patch more than necessary. --- src/core/evaluator.js | 44 ++++++++++++++++++++++++++--------------- src/core/image_utils.js | 41 +++++++++++++++++++++++++++++++++++++- 2 files changed, 68 insertions(+), 17 deletions(-) diff --git a/src/core/evaluator.js b/src/core/evaluator.js index 4a307f142..05c287f0e 100644 --- a/src/core/evaluator.js +++ b/src/core/evaluator.js @@ -79,6 +79,7 @@ import { DecodeStream } from "./stream.js"; import { getGlyphsUnicode } from "./glyphlist.js"; import { getMetrics } from "./metrics.js"; import { isPDFFunction } from "./function.js"; +import { LocalImageCache } from "./image_utils.js"; import { MurmurHash3_64 } from "./murmurhash3.js"; import { OperatorList } from "./operator_list.js"; import { PDFImage } from "./image.js"; @@ -444,7 +445,7 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { isInline = false, operatorList, cacheKey, - imageCache, + localImageCache, }) { var dict = image.dict; const imageRef = dict.objId; @@ -491,10 +492,10 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { operatorList.addOp(OPS.paintImageMaskXObject, args); if (cacheKey) { - imageCache[cacheKey] = { + localImageCache.set(cacheKey, imageRef, { fn: OPS.paintImageMaskXObject, args, - }; + }); } return undefined; } @@ -598,10 +599,10 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { operatorList.addOp(OPS.paintImageXObject, args); if (cacheKey) { - imageCache[cacheKey] = { + localImageCache.set(cacheKey, imageRef, { fn: OPS.paintImageXObject, args, - }; + }); if (imageRef) { this.globalImageCache.addPageIndex(imageRef, this.pageIndex); @@ -1201,7 +1202,7 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { var self = this; var xref = this.xref; let parsingText = false; - var imageCache = Object.create(null); + const localImageCache = new LocalImageCache(); var xobjs = resources.get("XObject") || Dict.empty; var patterns = resources.get("Pattern") || Dict.empty; @@ -1248,10 +1249,13 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { case OPS.paintXObject: // eagerly compile XForm objects var name = args[0].name; - if (name && imageCache[name] !== undefined) { - operatorList.addOp(imageCache[name].fn, imageCache[name].args); - args = null; - continue; + if (name) { + const localImage = localImageCache.getByName(name); + if (localImage) { + operatorList.addOp(localImage.fn, localImage.args); + args = null; + continue; + } } next( @@ -1264,11 +1268,18 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { let xobj = xobjs.getRaw(name); if (xobj instanceof Ref) { + const localImage = localImageCache.getByRef(xobj); + if (localImage) { + operatorList.addOp(localImage.fn, localImage.args); + + resolveXObject(); + return; + } + const globalImage = self.globalImageCache.getData( xobj, self.pageIndex ); - if (globalImage) { operatorList.addDependency(globalImage.objId); operatorList.addOp(globalImage.fn, globalImage.args); @@ -1276,6 +1287,7 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { resolveXObject(); return; } + xobj = xref.fetch(xobj); } @@ -1316,7 +1328,7 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { image: xobj, operatorList, cacheKey: name, - imageCache, + localImageCache, }) .then(resolveXObject, rejectXObject); return; @@ -1375,9 +1387,9 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { case OPS.endInlineImage: var cacheKey = args[0].cacheKey; if (cacheKey) { - var cacheEntry = imageCache[cacheKey]; - if (cacheEntry !== undefined) { - operatorList.addOp(cacheEntry.fn, cacheEntry.args); + const localImage = localImageCache.getByName(cacheKey); + if (localImage) { + operatorList.addOp(localImage.fn, localImage.args); args = null; continue; } @@ -1389,7 +1401,7 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { isInline: true, operatorList, cacheKey, - imageCache, + localImageCache, }) ); return; diff --git a/src/core/image_utils.js b/src/core/image_utils.js index 0cbdc2c61..ea1090dff 100644 --- a/src/core/image_utils.js +++ b/src/core/image_utils.js @@ -17,6 +17,45 @@ import { assert, info, shadow } from "../shared/util.js"; import { RefSetCache } from "./primitives.js"; +class LocalImageCache { + constructor() { + this._nameRefMap = new Map(); + this._imageMap = new Map(); + this._imageCache = new RefSetCache(); + } + + getByName(name) { + const ref = this._nameRefMap.get(name); + if (ref) { + return this.getByRef(ref); + } + return this._imageMap.get(name) || null; + } + + getByRef(ref) { + return this._imageCache.get(ref) || null; + } + + set(name, ref = null, data) { + if (!name) { + throw new Error('LocalImageCache.set - expected "name" argument.'); + } + if (ref) { + if (this._imageCache.has(ref)) { + return; + } + this._nameRefMap.set(name, ref); + this._imageCache.put(ref, data); + return; + } + // name + if (this._imageMap.has(name)) { + return; + } + this._imageMap.set(name, data); + } +} + class GlobalImageCache { static get NUM_PAGES_THRESHOLD() { return shadow(this, "NUM_PAGES_THRESHOLD", 2); @@ -111,4 +150,4 @@ class GlobalImageCache { } } -export { GlobalImageCache }; +export { LocalImageCache, GlobalImageCache }; From 4ef547f4008b6dc02700768c6e038a64bc2e88f6 Mon Sep 17 00:00:00 2001 From: Jonas Jenwald Date: Tue, 26 May 2020 09:47:59 +0200 Subject: [PATCH 2/2] Improve caching of empty `/XObject`s in the `PartialEvaluator.getTextContent` method It turns out that `getTextContent` suffers from *similar* problems with repeated images as `getOperatorList`; please see the previous patch. While only `/XObject` resources of the `Form`-type will actually be *parsed* in `PartialEvaluator.getTextContent`, since those are the only ones that may contain text, we're still forced to fetch repeated image resources where the name differs (but not the reference). Obviously it's less bad in this case, since we're not actually parsing `/XObject`s of e.g. the `Image`-type. However, you still want to avoid even fetching the data whenever possible, since `Stream`s are not cached on the `XRef` instance (given their potential size) and the lookup can thus be somewhat expensive in general. To address these issues, we can simply replace the exiting name-only caching in `PartialEvaluator.getTextContent` with a new cache backed by `LocalImageCache` instead. --- src/core/evaluator.js | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/src/core/evaluator.js b/src/core/evaluator.js index 05c287f0e..05e033244 100644 --- a/src/core/evaluator.js +++ b/src/core/evaluator.js @@ -1724,7 +1724,7 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { // The xobj is parsed iff it's needed, e.g. if there is a `DO` cmd. var xobjs = null; - var skipEmptyXObjs = Object.create(null); + const emptyXObjectCache = new LocalImageCache(); var preprocessor = new EvaluatorPreprocessor(stream, xref, stateManager); @@ -2200,7 +2200,7 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { } var name = args[0].name; - if (name && skipEmptyXObjs[name] !== undefined) { + if (name && emptyXObjectCache.getByName(name)) { break; } @@ -2212,7 +2212,16 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { ); } - const xobj = xobjs.get(name); + let xobj = xobjs.getRaw(name); + if (xobj instanceof Ref) { + if (emptyXObjectCache.getByRef(xobj)) { + resolveXObject(); + return; + } + + xobj = xref.fetch(xobj); + } + if (!xobj) { resolveXObject(); return; @@ -2227,7 +2236,8 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { } if (type.name !== "Form") { - skipEmptyXObjs[name] = true; + emptyXObjectCache.set(name, xobj.dict.objId, true); + resolveXObject(); return; } @@ -2278,7 +2288,7 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { }) .then(function () { if (!sinkWrapper.enqueueInvoked) { - skipEmptyXObjs[name] = true; + emptyXObjectCache.set(name, xobj.dict.objId, true); } resolveXObject(); }, rejectXObject);