Export the "raw" toUnicode-data from PartialEvaluator.preEvaluateFont

Unlike other data-structures, e.g. `Dict`s, Streams are purposely *not* cached on the `XRef`-instance.[1]
The somewhat unfortunate effect of Streams not being cached is that repeatedly getting the *same* Stream-data requires re-parsing/re-initializing a bunch of data; see `XRef.fetch` and related methods.

For the font-parsing in particular we're currently fetching the `toUnicode`-data, which is very often a Stream, in `PartialEvaluator.preEvaluateFont` and then *again* in `PartialEvaluator.extractDataStructures` soon afterwards.
By instead letting `PartialEvaluator.preEvaluateFont` export the "raw" `toUnicode`-data, we can avoid *some* unnecessary re-parsing/re-initializing when handling fonts.
*Please note:* In this particular case, given that `PartialEvaluator.preEvaluateFont` only accesses the "raw" `toUnicode` data, exporting a Stream should be safe.

---
[1] The reasons for this include:
 - Streams, especially `DecodeStream`-instances, can become *very* large once read. Hence caching them really isn't a good idea simply because of the (potential) memory impact of doing so.

 - Attempting to read from the *same* Stream-instance more than once won't work, unless it's `reset` in between, since any method, e.g. `getBytes`, always starts reading at the current data position.

 - Given that parsing, even in the worker-thread, is now fairly asynchronous, it's generally impossible to assert that any one Stream-instance isn't being accessed "concurrently" by e.g. different `getOperatorList` calls. Hence `reset`-ing a cached Stream-instance isn't going to work in the general case.
This commit is contained in:
Jonas Jenwald 2021-05-07 22:25:08 +02:00
parent 13fb1654dc
commit 6eef69de22

View File

@ -2978,10 +2978,9 @@ class PartialEvaluator {
const xref = this.xref;
let cidToGidBytes;
// 9.10.2
const toUnicode = dict.get("ToUnicode") || baseDict.get("ToUnicode");
const toUnicodePromise = toUnicode
? this.readToUnicode(toUnicode)
: Promise.resolve(undefined);
const toUnicodePromise = this.readToUnicode(
properties.toUnicode || dict.get("ToUnicode") || baseDict.get("ToUnicode")
);
if (properties.composite) {
// CIDSystemInfo helps to match CID to glyphs
@ -3289,8 +3288,10 @@ class PartialEvaluator {
);
}
readToUnicode(toUnicode) {
const cmapObj = toUnicode;
readToUnicode(cmapObj) {
if (!cmapObj) {
return Promise.resolve(null);
}
if (isName(cmapObj)) {
return CMapFactory.create({
encoding: cmapObj,
@ -3541,7 +3542,7 @@ class PartialEvaluator {
}
let composite = false;
let uint8array;
let hash, toUnicode;
if (type.name === "Type0") {
// If font is a composite
// - get the descendant font
@ -3566,7 +3567,6 @@ class PartialEvaluator {
const firstChar = dict.get("FirstChar") || 0,
lastChar = dict.get("LastChar") || (composite ? 0xffff : 0xff);
const descriptor = dict.get("FontDescriptor");
let hash;
if (descriptor) {
hash = new MurmurHash3_64();
@ -3601,10 +3601,10 @@ class PartialEvaluator {
hash.update(`${firstChar}-${lastChar}`); // Fixes issue10665_reduced.pdf
const toUnicode = dict.get("ToUnicode") || baseDict.get("ToUnicode");
toUnicode = dict.get("ToUnicode") || baseDict.get("ToUnicode");
if (isStream(toUnicode)) {
const stream = toUnicode.str || toUnicode;
uint8array = stream.buffer
const uint8array = stream.buffer
? new Uint8Array(stream.buffer.buffer, 0, stream.bufferLength)
: new Uint8Array(
stream.bytes.buffer,
@ -3659,18 +3659,22 @@ class PartialEvaluator {
type: type.name,
firstChar,
lastChar,
toUnicode,
hash: hash ? hash.hexdigest() : "",
};
}
async translateFont(preEvaluatedFont) {
const baseDict = preEvaluatedFont.baseDict;
const dict = preEvaluatedFont.dict;
const composite = preEvaluatedFont.composite;
let descriptor = preEvaluatedFont.descriptor;
const type = preEvaluatedFont.type;
const firstChar = preEvaluatedFont.firstChar,
lastChar = preEvaluatedFont.lastChar;
async translateFont({
descriptor,
dict,
baseDict,
composite,
type,
firstChar,
lastChar,
toUnicode,
cssFontInfo,
}) {
let properties;
if (!descriptor) {
@ -3710,6 +3714,7 @@ class PartialEvaluator {
flags,
firstChar,
lastChar,
toUnicode,
};
const widths = dict.get("Widths");
return this.extractDataStructures(dict, dict, properties).then(
@ -3806,6 +3811,7 @@ class PartialEvaluator {
fontMatrix: dict.getArray("FontMatrix") || FONT_IDENTITY_MATRIX,
firstChar,
lastChar,
toUnicode,
bbox: descriptor.getArray("FontBBox"),
ascent: descriptor.get("Ascent"),
descent: descriptor.get("Descent"),
@ -3814,7 +3820,7 @@ class PartialEvaluator {
flags: descriptor.get("Flags"),
italicAngle: descriptor.get("ItalicAngle"),
isType3Font: false,
cssFontInfo: preEvaluatedFont.cssFontInfo,
cssFontInfo,
};
if (composite) {