Export the "raw" toUnicode-data from PartialEvaluator.preEvaluateFont

Compared to other data-structures, such as e.g. `Dict`s, we're purposely *not* caching Streams on the `XRef`-instance.[1]
The, somewhat unfortunate, effect of Streams not being cached is that repeatedly getting the *same* Stream-data requires re-parsing/re-initializing of a bunch of data; see `XRef.fetch` and related methods.

For the font-parsing in particular we're currently fetching the `toUnicode`-data, which is very often a Stream, in `PartialEvaluator.preEvaluateFont` and then *again* in `PartialEvaluator.extractDataStructures` soon afterwards.
By instead letting `PartialEvaluator.preEvaluateFont` export the "raw" `toUnicode`-data, we can avoid *some* unnecessary re-parsing/re-initializing when handling fonts.
*Please note:* In this particular case, given that `PartialEvaluator.preEvaluateFont` only accesses the "raw" `toUnicode` data, exporting a Stream should be safe.

---
[1] The reasons for this include:
 - Streams, especially `DecodeStream`-instances, can become *very* large once read. Hence caching them really isn't a good idea simply because of the (potential) memory impact of doing so.

 - Attempting to read from the *same* Stream-instance more than once won't work, unless it's `reset` in between, since using any method such as e.g. `getBytes` always starts at the current data position.

 - Given that parsing, even in the worker-thread, is now fairly asynchronous it's generally impossible to assert that any one Stream-instance isn't being accessed "concurrently" by e.g. different `getOperatorList` calls. Hence `reset`-ing a cached Stream-instance isn't going to work in the general case.
This commit is contained in:
Jonas Jenwald 2021-05-07 22:25:08 +02:00
parent 13fb1654dc
commit 6eef69de22

View File

@@ -2978,10 +2978,9 @@ class PartialEvaluator {
const xref = this.xref; const xref = this.xref;
let cidToGidBytes; let cidToGidBytes;
// 9.10.2 // 9.10.2
const toUnicode = dict.get("ToUnicode") || baseDict.get("ToUnicode"); const toUnicodePromise = this.readToUnicode(
const toUnicodePromise = toUnicode properties.toUnicode || dict.get("ToUnicode") || baseDict.get("ToUnicode")
? this.readToUnicode(toUnicode) );
: Promise.resolve(undefined);
if (properties.composite) { if (properties.composite) {
// CIDSystemInfo helps to match CID to glyphs // CIDSystemInfo helps to match CID to glyphs
@@ -3289,8 +3288,10 @@
); );
} }
readToUnicode(toUnicode) { readToUnicode(cmapObj) {
const cmapObj = toUnicode; if (!cmapObj) {
return Promise.resolve(null);
}
if (isName(cmapObj)) { if (isName(cmapObj)) {
return CMapFactory.create({ return CMapFactory.create({
encoding: cmapObj, encoding: cmapObj,
@@ -3541,7 +3542,7 @@
} }
let composite = false; let composite = false;
let uint8array; let hash, toUnicode;
if (type.name === "Type0") { if (type.name === "Type0") {
// If font is a composite // If font is a composite
// - get the descendant font // - get the descendant font
@@ -3566,7 +3567,6 @@
const firstChar = dict.get("FirstChar") || 0, const firstChar = dict.get("FirstChar") || 0,
lastChar = dict.get("LastChar") || (composite ? 0xffff : 0xff); lastChar = dict.get("LastChar") || (composite ? 0xffff : 0xff);
const descriptor = dict.get("FontDescriptor"); const descriptor = dict.get("FontDescriptor");
let hash;
if (descriptor) { if (descriptor) {
hash = new MurmurHash3_64(); hash = new MurmurHash3_64();
@@ -3601,10 +3601,10 @@
hash.update(`${firstChar}-${lastChar}`); // Fixes issue10665_reduced.pdf hash.update(`${firstChar}-${lastChar}`); // Fixes issue10665_reduced.pdf
const toUnicode = dict.get("ToUnicode") || baseDict.get("ToUnicode"); toUnicode = dict.get("ToUnicode") || baseDict.get("ToUnicode");
if (isStream(toUnicode)) { if (isStream(toUnicode)) {
const stream = toUnicode.str || toUnicode; const stream = toUnicode.str || toUnicode;
uint8array = stream.buffer const uint8array = stream.buffer
? new Uint8Array(stream.buffer.buffer, 0, stream.bufferLength) ? new Uint8Array(stream.buffer.buffer, 0, stream.bufferLength)
: new Uint8Array( : new Uint8Array(
stream.bytes.buffer, stream.bytes.buffer,
@@ -3659,18 +3659,22 @@
type: type.name, type: type.name,
firstChar, firstChar,
lastChar, lastChar,
toUnicode,
hash: hash ? hash.hexdigest() : "", hash: hash ? hash.hexdigest() : "",
}; };
} }
async translateFont(preEvaluatedFont) { async translateFont({
const baseDict = preEvaluatedFont.baseDict; descriptor,
const dict = preEvaluatedFont.dict; dict,
const composite = preEvaluatedFont.composite; baseDict,
let descriptor = preEvaluatedFont.descriptor; composite,
const type = preEvaluatedFont.type; type,
const firstChar = preEvaluatedFont.firstChar, firstChar,
lastChar = preEvaluatedFont.lastChar; lastChar,
toUnicode,
cssFontInfo,
}) {
let properties; let properties;
if (!descriptor) { if (!descriptor) {
@@ -3710,6 +3714,7 @@
flags, flags,
firstChar, firstChar,
lastChar, lastChar,
toUnicode,
}; };
const widths = dict.get("Widths"); const widths = dict.get("Widths");
return this.extractDataStructures(dict, dict, properties).then( return this.extractDataStructures(dict, dict, properties).then(
@@ -3806,6 +3811,7 @@
fontMatrix: dict.getArray("FontMatrix") || FONT_IDENTITY_MATRIX, fontMatrix: dict.getArray("FontMatrix") || FONT_IDENTITY_MATRIX,
firstChar, firstChar,
lastChar, lastChar,
toUnicode,
bbox: descriptor.getArray("FontBBox"), bbox: descriptor.getArray("FontBBox"),
ascent: descriptor.get("Ascent"), ascent: descriptor.get("Ascent"),
descent: descriptor.get("Descent"), descent: descriptor.get("Descent"),
@@ -3814,7 +3820,7 @@
flags: descriptor.get("Flags"), flags: descriptor.get("Flags"),
italicAngle: descriptor.get("ItalicAngle"), italicAngle: descriptor.get("ItalicAngle"),
isType3Font: false, isType3Font: false,
cssFontInfo: preEvaluatedFont.cssFontInfo, cssFontInfo,
}; };
if (composite) { if (composite) {