From 13fb1654dc20f814f3cfdc498371b2f438aa884c Mon Sep 17 00:00:00 2001 From: Jonas Jenwald Date: Fri, 7 May 2021 22:07:23 +0200 Subject: [PATCH 1/2] Export the `firstChar`/`lastChar`-data from `PartialEvaluator.preEvaluateFont` Rather than re-fetching/re-parsing these properties immediately in `PartialEvaluator.translateFont`, we can simply export them instead. (Obviously the effect will be really tiny, but there is less parsing overall this way.) --- src/core/evaluator.js | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/src/core/evaluator.js b/src/core/evaluator.js index 7202e2052..d5476347d 100644 --- a/src/core/evaluator.js +++ b/src/core/evaluator.js @@ -3563,10 +3563,13 @@ class PartialEvaluator { composite = true; } + const firstChar = dict.get("FirstChar") || 0, + lastChar = dict.get("LastChar") || (composite ? 0xffff : 0xff); const descriptor = dict.get("FontDescriptor"); let hash; if (descriptor) { hash = new MurmurHash3_64(); + const encoding = baseDict.getRaw("Encoding"); if (isName(encoding)) { hash.update(encoding.name); @@ -3596,9 +3599,7 @@ class PartialEvaluator { } } - const firstChar = dict.get("FirstChar") || 0; - const lastChar = dict.get("LastChar") || (composite ? 0xffff : 0xff); - hash.update(`${firstChar}-${lastChar}`); + hash.update(`${firstChar}-${lastChar}`); // Fixes issue10665_reduced.pdf const toUnicode = dict.get("ToUnicode") || baseDict.get("ToUnicode"); if (isStream(toUnicode)) { @@ -3656,6 +3657,8 @@ class PartialEvaluator { baseDict, composite, type: type.name, + firstChar, + lastChar, hash: hash ? hash.hexdigest() : "", }; } @@ -3666,10 +3669,9 @@ class PartialEvaluator { const composite = preEvaluatedFont.composite; let descriptor = preEvaluatedFont.descriptor; const type = preEvaluatedFont.type; - const maxCharIndex = composite ? 0xffff : 0xff; + const firstChar = preEvaluatedFont.firstChar, + lastChar = preEvaluatedFont.lastChar; let properties; - const firstChar = dict.get("FirstChar") || 0; - const lastChar = dict.get("LastChar") || maxCharIndex; if (!descriptor) { if (type === "Type3") { @@ -3802,8 +3804,8 @@ class PartialEvaluator { composite, fixedPitch: false, fontMatrix: dict.getArray("FontMatrix") || FONT_IDENTITY_MATRIX, - firstChar: firstChar || 0, - lastChar: lastChar || maxCharIndex, + firstChar, + lastChar, bbox: descriptor.getArray("FontBBox"), ascent: descriptor.get("Ascent"), descent: descriptor.get("Descent"), From 6eef69de2261aa6e34987ec6b6707985de3cf960 Mon Sep 17 00:00:00 2001 From: Jonas Jenwald Date: Fri, 7 May 2021 22:25:08 +0200 Subject: [PATCH 2/2] Export the "raw" `toUnicode`-data from `PartialEvaluator.preEvaluateFont` Compared to other data-structures, such as e.g. `Dict`s, we're purposely *not* caching Streams on the `XRef`-instance.[1] The, somewhat unfortunate, effect of Streams not being cached is that repeatedly getting the *same* Stream-data requires re-parsing/re-initializing of a bunch of data; see `XRef.fetch` and related methods. For the font-parsing in particular we're currently fetching the `toUnicode`-data, which is very often a Stream, in `PartialEvaluator.preEvaluateFont` and then *again* in `PartialEvaluator.extractDataStructures` soon afterwards. By instead letting `PartialEvaluator.preEvaluateFont` export the "raw" `toUnicode`-data, we can avoid *some* unnecessary re-parsing/re-initializing when handling fonts. *Please note:* In this particular case, given that `PartialEvaluator.preEvaluateFont` only accesses the "raw" `toUnicode` data, exporting a Stream should be safe. --- [1] The reasons for this include: - Streams, especially `DecodeStream`-instances, can become *very* large once read. Hence caching them really isn't a good idea simply because of the (potential) memory impact of doing so. - Attempting to read from the *same* Stream-instance more than once won't work, unless it's `reset` in between, since using any method such as e.g. `getBytes` always starts at the current data position. - Given that parsing, even in the worker-thread, is now fairly asynchronous it's generally impossible to assert that any one Stream-instance isn't being accessed "concurrently" by e.g. different `getOperatorList` calls. Hence `reset`-ing a cached Stream-instance isn't going to work in the general case. --- src/core/evaluator.js | 44 ++++++++++++++++++++++++------------------- 1 file changed, 25 insertions(+), 19 deletions(-) diff --git a/src/core/evaluator.js b/src/core/evaluator.js index d5476347d..f38678ccf 100644 --- a/src/core/evaluator.js +++ b/src/core/evaluator.js @@ -2978,10 +2978,9 @@ class PartialEvaluator { const xref = this.xref; let cidToGidBytes; // 9.10.2 - const toUnicode = dict.get("ToUnicode") || baseDict.get("ToUnicode"); - const toUnicodePromise = toUnicode - ? this.readToUnicode(toUnicode) - : Promise.resolve(undefined); + const toUnicodePromise = this.readToUnicode( + properties.toUnicode || dict.get("ToUnicode") || baseDict.get("ToUnicode") + ); if (properties.composite) { // CIDSystemInfo helps to match CID to glyphs @@ -3289,8 +3288,10 @@ class PartialEvaluator { ); } - readToUnicode(toUnicode) { - const cmapObj = toUnicode; + readToUnicode(cmapObj) { + if (!cmapObj) { + return Promise.resolve(null); + } if (isName(cmapObj)) { return CMapFactory.create({ encoding: cmapObj, @@ -3541,7 +3542,7 @@ class PartialEvaluator { } let composite = false; - let uint8array; + let hash, toUnicode; if (type.name === "Type0") { // If font is a composite // - get the descendant font @@ -3566,7 +3567,6 @@ class PartialEvaluator { const firstChar = dict.get("FirstChar") || 0, lastChar = dict.get("LastChar") || (composite ? 0xffff : 0xff); const descriptor = dict.get("FontDescriptor"); - let hash; if (descriptor) { hash = new MurmurHash3_64(); @@ -3601,10 +3601,10 @@ class PartialEvaluator { hash.update(`${firstChar}-${lastChar}`); // Fixes issue10665_reduced.pdf - const toUnicode = dict.get("ToUnicode") || baseDict.get("ToUnicode"); + toUnicode = dict.get("ToUnicode") || baseDict.get("ToUnicode"); if (isStream(toUnicode)) { const stream = toUnicode.str || toUnicode; - uint8array = stream.buffer + const uint8array = stream.buffer ? new Uint8Array(stream.buffer.buffer, 0, stream.bufferLength) : new Uint8Array( stream.bytes.buffer, @@ -3659,18 +3659,22 @@ class PartialEvaluator { type: type.name, firstChar, lastChar, + toUnicode, hash: hash ? hash.hexdigest() : "", }; } - async translateFont(preEvaluatedFont) { - const baseDict = preEvaluatedFont.baseDict; - const dict = preEvaluatedFont.dict; - const composite = preEvaluatedFont.composite; - let descriptor = preEvaluatedFont.descriptor; - const type = preEvaluatedFont.type; - const firstChar = preEvaluatedFont.firstChar, - lastChar = preEvaluatedFont.lastChar; + async translateFont({ + descriptor, + dict, + baseDict, + composite, + type, + firstChar, + lastChar, + toUnicode, + cssFontInfo, + }) { let properties; if (!descriptor) { @@ -3710,6 +3714,7 @@ class PartialEvaluator { flags, firstChar, lastChar, + toUnicode, }; const widths = dict.get("Widths"); return this.extractDataStructures(dict, dict, properties).then( @@ -3806,6 +3811,7 @@ class PartialEvaluator { fontMatrix: dict.getArray("FontMatrix") || FONT_IDENTITY_MATRIX, firstChar, lastChar, + toUnicode, bbox: descriptor.getArray("FontBBox"), ascent: descriptor.get("Ascent"), descent: descriptor.get("Descent"), @@ -3814,7 +3820,7 @@ class PartialEvaluator { flags: descriptor.get("Flags"), italicAngle: descriptor.get("ItalicAngle"), isType3Font: false, - cssFontInfo: preEvaluatedFont.cssFontInfo, + cssFontInfo, }; if (composite) {