From ca719ecaa4b39e08b5ea31e01f82ca6db19a8845 Mon Sep 17 00:00:00 2001 From: Jonas Jenwald Date: Sun, 28 Jun 2020 13:12:24 +0200 Subject: [PATCH] Add local caching of `Function`s, by reference, in the `PDFFunctionFactory` (issue 2541) Note that compared to other structures, such as e.g. Images and ColorSpaces, `Function`s are not referred to by name, which however does bring the advantage of being able to share the cache for an *entire* page. Furthermore, similar to ColorSpaces, the parsing of individual `Function`s is generally fast enough to not really warrant trying to cache them in any "smarter" way than by reference. (Hence trying to do caching similar to e.g. Fonts would most likely be a losing proposition, given the amount of data lookup/parsing that'd be required.) Originally I tried implementing this similar to e.g. the recently added ColorSpace caching (and in a couple of different ways), however it unfortunately turned out to be quite ugly/unwieldy given the sheer number of functions/methods where you'd thus need to pass in a `LocalFunctionCache` instance. (Also, the affected functions/methods didn't exactly have short signatures as-is.) After going back and forth on this for a while it seemed to me that the simplest, or least "invasive" if you will, solution would be if each `PartialEvaluator` instance had its *own* `PDFFunctionFactory` instance (since the latter is already passed to all of the required code). This way each `PDFFunctionFactory` instance could have a local `Function` cache, without it being necessary to provide a `LocalFunctionCache` instance manually at every `PDFFunctionFactory.{create, createFromArray}` call-site. Obviously, with this patch, there are now (potentially) more `PDFFunctionFactory` instances than before when the entire document shared just one. However, each such instance is really quite small and it's also tied to a `PartialEvaluator` instance and those are *not* kept alive and/or cached. 
To reduce the impact of these changes, I've tried to make as many of these structures as possible *lazily initialized*, specifically: - The `PDFFunctionFactory`, on `PartialEvaluator` instances, since not all kinds of general parsing actually require it. For example: `getTextContent` calls won't cause any `Function` to be parsed, and even some `getOperatorList` calls won't trigger `Function` parsing (if a page contains e.g. no Patterns or "complex" ColorSpaces). - The `LocalFunctionCache`, on `PDFFunctionFactory` instances, since only certain parsing requires it. Generally speaking, only e.g. Patterns, "complex" ColorSpaces, and/or (some) SoftMasks will trigger any `Function` parsing. To put these changes into perspective, when loading/rendering all (14) pages of the default `tracemonkey.pdf` file there's now a total of 6 `PDFFunctionFactory` and 1 `LocalFunctionCache` instances created thanks to the lazy initialization. (If you instead would keep the document-"global" `PDFFunctionFactory` instance and pass around `LocalFunctionCache` instances everywhere, the numbers for the `tracemonkey.pdf` file would instead be something like 1 `PDFFunctionFactory` and 6 `LocalFunctionCache` instances.) All-in-all, I thus don't think that the `PDFFunctionFactory` changes should be generally problematic. With these changes, we can also modify (some) call-sites to pass in a `Reference` rather than the actual `Function` data. This is nice since `Function`s can also be `Streams`, which are not cached on the `XRef` instance (given their potential size), and this way we can avoid unnecessary lookups and thus save some additional time/resources. Obviously I had intended to include (standard) benchmark results with these changes, but for reasons I don't really understand the test run-time (even with `master`) of the document in issue 2541 is quite a bit slower than in the development viewer. 
However, logging the time it takes for the relevant `PDFFunctionFactory`/`PDFFunction` parsing shows that it takes *approximately* `0.5 ms` for the `Function` in question. Looking up a cached `Function`, on the other hand, is *one order of magnitude faster* which does add up when the same `Function` is invoked close to 2000 times. --- src/core/colorspace.js | 2 +- src/core/document.js | 11 ------ src/core/evaluator.js | 29 +++++++++++----- src/core/function.js | 76 ++++++++++++++++++++++++++++++++++++++--- src/core/image_utils.js | 23 ++++++++++++- src/core/pattern.js | 4 +-- 6 files changed, 116 insertions(+), 29 deletions(-) diff --git a/src/core/colorspace.js b/src/core/colorspace.js index d1597a730..c65cb15a5 100644 --- a/src/core/colorspace.js +++ b/src/core/colorspace.js @@ -535,7 +535,7 @@ class ColorSpace { const name = xref.fetchIfRef(cs[1]); numComps = Array.isArray(name) ? name.length : 1; alt = this.parseToIR(cs[2], xref, resources, pdfFunctionFactory); - const tintFn = pdfFunctionFactory.create(xref.fetchIfRef(cs[3])); + const tintFn = pdfFunctionFactory.create(cs[3]); return ["AlternateCS", numComps, alt, tintFn]; case "Lab": params = xref.fetchIfRef(cs[1]); diff --git a/src/core/document.js b/src/core/document.js index dd456bbfd..c0d343f05 100644 --- a/src/core/document.js +++ b/src/core/document.js @@ -53,7 +53,6 @@ import { calculateMD5 } from "./crypto.js"; import { Linearization } from "./parser.js"; import { OperatorList } from "./operator_list.js"; import { PartialEvaluator } from "./evaluator.js"; -import { PDFFunctionFactory } from "./function.js"; const DEFAULT_USER_UNIT = 1.0; const LETTER_SIZE_MEDIABOX = [0, 0, 612, 792]; @@ -75,7 +74,6 @@ class Page { fontCache, builtInCMapCache, globalImageCache, - pdfFunctionFactory, }) { this.pdfManager = pdfManager; this.pageIndex = pageIndex; @@ -85,7 +83,6 @@ class Page { this.fontCache = fontCache; this.builtInCMapCache = builtInCMapCache; this.globalImageCache = globalImageCache; - 
this.pdfFunctionFactory = pdfFunctionFactory; this.evaluatorOptions = pdfManager.evaluatorOptions; this.resourcesPromise = null; @@ -265,7 +262,6 @@ class Page { builtInCMapCache: this.builtInCMapCache, globalImageCache: this.globalImageCache, options: this.evaluatorOptions, - pdfFunctionFactory: this.pdfFunctionFactory, }); const dataPromises = Promise.all([contentStreamPromise, resourcesPromise]); @@ -359,7 +355,6 @@ class Page { builtInCMapCache: this.builtInCMapCache, globalImageCache: this.globalImageCache, options: this.evaluatorOptions, - pdfFunctionFactory: this.pdfFunctionFactory, }); return partialEvaluator.getTextContent({ @@ -508,11 +503,6 @@ class PDFDocument { this.pdfManager = pdfManager; this.stream = stream; this.xref = new XRef(stream, pdfManager); - - this.pdfFunctionFactory = new PDFFunctionFactory({ - xref: this.xref, - isEvalSupported: pdfManager.evaluatorOptions.isEvalSupported, - }); this._pagePromises = []; } @@ -821,7 +811,6 @@ class PDFDocument { fontCache: catalog.fontCache, builtInCMapCache: catalog.builtInCMapCache, globalImageCache: catalog.globalImageCache, - pdfFunctionFactory: this.pdfFunctionFactory, }); })); } diff --git a/src/core/evaluator.js b/src/core/evaluator.js index d11804ba7..0db8e4cc6 100644 --- a/src/core/evaluator.js +++ b/src/core/evaluator.js @@ -26,6 +26,7 @@ import { isNum, isString, OPS, + shadow, stringToPDFString, TextRenderingMode, UNSUPPORTED_FEATURES, @@ -72,6 +73,7 @@ import { getSymbolsFonts, } from "./standard_fonts.js"; import { getTilingPatternIR, Pattern } from "./pattern.js"; +import { isPDFFunction, PDFFunctionFactory } from "./function.js"; import { Lexer, Parser } from "./parser.js"; import { LocalColorSpaceCache, LocalImageCache } from "./image_utils.js"; import { bidi } from "./bidi.js"; @@ -79,7 +81,6 @@ import { ColorSpace } from "./colorspace.js"; import { DecodeStream } from "./stream.js"; import { getGlyphsUnicode } from "./glyphlist.js"; import { getMetrics } from "./metrics.js"; -import { 
isPDFFunction } from "./function.js"; import { MurmurHash3_64 } from "./murmurhash3.js"; import { OperatorList } from "./operator_list.js"; import { PDFImage } from "./image.js"; @@ -103,7 +104,6 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { builtInCMapCache, globalImageCache, options = null, - pdfFunctionFactory, }) { this.xref = xref; this.handler = handler; @@ -113,7 +113,6 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { this.builtInCMapCache = builtInCMapCache; this.globalImageCache = globalImageCache; this.options = options || DefaultPartialEvaluatorOptions; - this.pdfFunctionFactory = pdfFunctionFactory; this.parsingType3Font = false; this._fetchBuiltInCMapBound = this.fetchBuiltInCMap.bind(this); @@ -207,6 +206,18 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { SHADING_PATTERN = 2; PartialEvaluator.prototype = { + /** + * Since Functions are only cached (locally) by reference, we can share one + * `PDFFunctionFactory` instance within this `PartialEvaluator` instance. 
+ */ + get _pdfFunctionFactory() { + const pdfFunctionFactory = new PDFFunctionFactory({ + xref: this.xref, + isEvalSupported: this.options.isEvalSupported, + }); + return shadow(this, "_pdfFunctionFactory", pdfFunctionFactory); + }, + clone(newOptions = DefaultPartialEvaluatorOptions) { var newEvaluator = Object.create(this); newEvaluator.options = newOptions; @@ -552,7 +563,7 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { res: resources, image, isInline, - pdfFunctionFactory: this.pdfFunctionFactory, + pdfFunctionFactory: this._pdfFunctionFactory, localColorSpaceCache, }); // We force the use of RGBA_32BPP images here, because we can't handle @@ -589,7 +600,7 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { res: resources, image, isInline, - pdfFunctionFactory: this.pdfFunctionFactory, + pdfFunctionFactory: this._pdfFunctionFactory, localColorSpaceCache, }) .then(imageObj => { @@ -651,7 +662,7 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { // we will build a map of integer values in range 0..255 to be fast. 
var transferObj = smask.get("TR"); if (isPDFFunction(transferObj)) { - const transferFn = this.pdfFunctionFactory.create(transferObj); + const transferFn = this._pdfFunctionFactory.create(transferObj); var transferMap = new Uint8Array(256); var tmp = new Float32Array(1); for (var i = 0; i < 256; i++) { @@ -1145,7 +1156,7 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { cs, xref: this.xref, resources, - pdfFunctionFactory: this.pdfFunctionFactory, + pdfFunctionFactory: this._pdfFunctionFactory, localColorSpaceCache, }).catch(reason => { if (reason instanceof AbortException) { @@ -1202,7 +1213,7 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { this.xref, resources, this.handler, - this.pdfFunctionFactory, + this._pdfFunctionFactory, localColorSpaceCache ); operatorList.addOp(fn, pattern.getIR()); @@ -1641,7 +1652,7 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { xref, resources, self.handler, - self.pdfFunctionFactory, + self._pdfFunctionFactory, localColorSpaceCache ); var patternIR = shadingFill.getIR(); diff --git a/src/core/function.js b/src/core/function.js index e6f264a2c..0b9a7fad6 100644 --- a/src/core/function.js +++ b/src/core/function.js @@ -13,6 +13,7 @@ * limitations under the License. */ +import { Dict, isDict, isStream, Ref } from "./primitives.js"; import { FormatError, info, @@ -20,29 +21,94 @@ import { IsEvalSupportedCached, unreachable, } from "../shared/util.js"; -import { isDict, isStream } from "./primitives.js"; import { PostScriptLexer, PostScriptParser } from "./ps_parser.js"; +import { LocalFunctionCache } from "./image_utils.js"; class PDFFunctionFactory { constructor({ xref, isEvalSupported = true }) { this.xref = xref; this.isEvalSupported = isEvalSupported !== false; + this._localFunctionCache = null; // Initialized lazily. 
} create(fn) { - return PDFFunction.parse({ + const cachedFunction = this.getCached(fn); + if (cachedFunction) { + return cachedFunction; + } + const parsedFunction = PDFFunction.parse({ xref: this.xref, isEvalSupported: this.isEvalSupported, - fn, + fn: fn instanceof Ref ? this.xref.fetch(fn) : fn, }); + + // Attempt to cache the parsed Function, by reference. + this._cache(fn, parsedFunction); + + return parsedFunction; } createFromArray(fnObj) { - return PDFFunction.parseArray({ + const cachedFunction = this.getCached(fnObj); + if (cachedFunction) { + return cachedFunction; + } + const parsedFunction = PDFFunction.parseArray({ xref: this.xref, isEvalSupported: this.isEvalSupported, - fnObj, + fnObj: fnObj instanceof Ref ? this.xref.fetch(fnObj) : fnObj, }); + + // Attempt to cache the parsed Function, by reference. + this._cache(fnObj, parsedFunction); + + return parsedFunction; + } + + getCached(cacheKey) { + let fnRef; + if (cacheKey instanceof Ref) { + fnRef = cacheKey; + } else if (cacheKey instanceof Dict) { + fnRef = cacheKey.objId; + } else if (isStream(cacheKey)) { + fnRef = cacheKey.dict && cacheKey.dict.objId; + } + if (fnRef) { + if (!this._localFunctionCache) { + this._localFunctionCache = new LocalFunctionCache(); + } + const localFunction = this._localFunctionCache.getByRef(fnRef); + if (localFunction) { + return localFunction; + } + } + return null; + } + + /** + * @private + */ + _cache(cacheKey, parsedFunction) { + if (!parsedFunction) { + throw new Error( + 'PDFFunctionFactory._cache - expected "parsedFunction" argument.' 
+ ); + } + let fnRef; + if (cacheKey instanceof Ref) { + fnRef = cacheKey; + } else if (cacheKey instanceof Dict) { + fnRef = cacheKey.objId; + } else if (isStream(cacheKey)) { + fnRef = cacheKey.dict && cacheKey.dict.objId; + } + if (fnRef) { + if (!this._localFunctionCache) { + this._localFunctionCache = new LocalFunctionCache(); + } + this._localFunctionCache.set(/* name = */ null, fnRef, parsedFunction); + } } } diff --git a/src/core/image_utils.js b/src/core/image_utils.js index d255cc098..c42c5d8e4 100644 --- a/src/core/image_utils.js +++ b/src/core/image_utils.js @@ -91,6 +91,22 @@ class LocalColorSpaceCache extends BaseLocalCache { } } +class LocalFunctionCache extends BaseLocalCache { + getByName(name) { + unreachable("Should not call `getByName` method."); + } + + set(name = null, ref, data) { + if (!ref) { + throw new Error('LocalFunctionCache.set - expected "ref" argument.'); + } + if (this._imageCache.has(ref)) { + return; + } + this._imageCache.put(ref, data); + } +} + class GlobalImageCache { static get NUM_PAGES_THRESHOLD() { return shadow(this, "NUM_PAGES_THRESHOLD", 2); @@ -184,4 +200,9 @@ class GlobalImageCache { } } -export { LocalImageCache, LocalColorSpaceCache, GlobalImageCache }; +export { + LocalImageCache, + LocalColorSpaceCache, + LocalFunctionCache, + GlobalImageCache, +}; diff --git a/src/core/pattern.js b/src/core/pattern.js index e1f63b513..c3daefb72 100644 --- a/src/core/pattern.js +++ b/src/core/pattern.js @@ -178,7 +178,7 @@ Shadings.RadialAxial = (function RadialAxialClosure() { this.extendStart = extendStart; this.extendEnd = extendEnd; - var fnObj = dict.get("Function"); + var fnObj = dict.getRaw("Function"); var fn = pdfFunctionFactory.createFromArray(fnObj); // 10 samples seems good enough for now, but probably won't work @@ -878,7 +878,7 @@ Shadings.Mesh = (function MeshClosure() { ? cs.getRgb(dict.get("Background"), 0) : null; - var fnObj = dict.get("Function"); + var fnObj = dict.getRaw("Function"); var fn = fnObj ? 
pdfFunctionFactory.createFromArray(fnObj) : null; this.coords = [];