From 3660aaac85e6be571c238e969d9cf5ca598e4215 Mon Sep 17 00:00:00 2001 From: Jonas Jenwald Date: Mon, 17 May 2021 14:34:08 +0200 Subject: [PATCH 1/4] Tweak `adjustToUnicode` to allow extending a built-in /ToUnicode map *This is somewhat similar to the recent changes, in PR 13277, for fonts with an /Encoding entry.* Currently we're *completely* ignoring the `builtInEncoding`, from the font data itself, for fonts which have a built-in /ToUnicode map. While it (obviously) doesn't seem like a good idea in general to simply overwrite existing built-in /ToUnicode entries, it should however not hurt to use the `builtInEncoding` to supplement *missing* /ToUnicode entries. --- src/core/fonts.js | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/src/core/fonts.js b/src/core/fonts.js index 8b8ec434d..b391ab693 100644 --- a/src/core/fonts.js +++ b/src/core/fonts.js @@ -135,9 +135,6 @@ function adjustToUnicode(properties, builtInEncoding) { if (properties.isInternalFont) { return; } - if (properties.hasIncludedToUnicodeMap) { - return; // The font dictionary has a `ToUnicode` entry. - } if (builtInEncoding === properties.defaultEncoding) { return; // No point in trying to adjust `toUnicode` if the encodings match. } @@ -147,11 +144,17 @@ function adjustToUnicode(properties, builtInEncoding) { const toUnicode = [], glyphsUnicodeMap = getGlyphsUnicode(); for (const charCode in builtInEncoding) { - if ( - properties.hasEncoding && - properties.differences[charCode] !== undefined - ) { - continue; // The font dictionary has an `Encoding`/`Differences` entry. + if (properties.hasIncludedToUnicodeMap) { + if (properties.toUnicode.has(charCode)) { + continue; // The font dictionary has a `ToUnicode` entry. + } + } else { + if ( + properties.hasEncoding && + properties.differences[charCode] !== undefined + ) { + continue; // The font dictionary has an `Encoding`/`Differences` entry. 
+ } } const glyphName = builtInEncoding[charCode]; const unicode = getUnicodeForGlyph(glyphName, glyphsUnicodeMap); @@ -159,7 +162,9 @@ function adjustToUnicode(properties, builtInEncoding) { toUnicode[charCode] = String.fromCharCode(unicode); } } - properties.toUnicode.amend(toUnicode); + if (toUnicode.length > 0) { + properties.toUnicode.amend(toUnicode); + } } class Glyph { From edc38de37ac7a1ba659b59598a254c38ac1fcf33 Mon Sep 17 00:00:00 2001 From: Jonas Jenwald Date: Mon, 17 May 2021 15:40:23 +0200 Subject: [PATCH 2/4] Convert `PartialEvaluator.buildToUnicode` to an `async` method This removes the need to *manually* wrap all return values in a Promise. --- src/core/evaluator.js | 50 +++++++++++++++++++------------------------ 1 file changed, 22 insertions(+), 28 deletions(-) diff --git a/src/core/evaluator.js b/src/core/evaluator.js index 642e6aa47..4ef6f6e57 100644 --- a/src/core/evaluator.js +++ b/src/core/evaluator.js @@ -3284,7 +3284,7 @@ class PartialEvaluator { * @returns {Promise} A Promise that is resolved with a * {ToUnicodeMap|IdentityToUnicodeMap} object. */ - buildToUnicode(properties) { + async buildToUnicode(properties) { properties.hasIncludedToUnicodeMap = !!properties.toUnicode && properties.toUnicode.length > 0; @@ -3297,8 +3297,7 @@ class PartialEvaluator { properties.fallbackToUnicode = this._buildSimpleFontToUnicode(properties); } - - return Promise.resolve(properties.toUnicode); + return properties.toUnicode; } // According to the spec if the font is a simple font we should only map @@ -3307,7 +3306,7 @@ class PartialEvaluator { // in pratice it seems better to always try to create a toUnicode map // based of the default encoding. 
if (!properties.composite /* is simple font */) { - return Promise.resolve(this._buildSimpleFontToUnicode(properties)); + return this._buildSimpleFontToUnicode(properties); } // If the font is a composite font that uses one of the predefined CMaps @@ -3330,42 +3329,37 @@ class PartialEvaluator { // b) Obtain the registry and ordering of the character collection used // by the font’s CMap (for example, Adobe and Japan1) from its // CIDSystemInfo dictionary. - const registry = properties.cidSystemInfo.registry; - const ordering = properties.cidSystemInfo.ordering; + const { registry, ordering } = properties.cidSystemInfo; // c) Construct a second CMap name by concatenating the registry and // ordering obtained in step (b) in the format registry–ordering–UCS2 // (for example, Adobe–Japan1–UCS2). - const ucs2CMapName = Name.get(registry + "-" + ordering + "-UCS2"); + const ucs2CMapName = Name.get(`${registry}-${ordering}-UCS2`); // d) Obtain the CMap with the name constructed in step (c) (available // from the ASN Web site; see the Bibliography). - return CMapFactory.create({ + const ucs2CMap = await CMapFactory.create({ encoding: ucs2CMapName, fetchBuiltInCMap: this._fetchBuiltInCMapBound, useCMap: null, - }).then(function (ucs2CMap) { - const cMap = properties.cMap; - const toUnicode = []; - cMap.forEach(function (charcode, cid) { - if (cid > 0xffff) { - throw new FormatError("Max size of CID is 65,535"); - } - // e) Map the CID obtained in step (a) according to the CMap - // obtained in step (d), producing a Unicode value. 
- const ucs2 = ucs2CMap.lookup(cid); - if (ucs2) { - toUnicode[charcode] = String.fromCharCode( - (ucs2.charCodeAt(0) << 8) + ucs2.charCodeAt(1) - ); - } - }); - return new ToUnicodeMap(toUnicode); }); + const toUnicode = []; + properties.cMap.forEach(function (charcode, cid) { + if (cid > 0xffff) { + throw new FormatError("Max size of CID is 65,535"); + } + // e) Map the CID obtained in step (a) according to the CMap + // obtained in step (d), producing a Unicode value. + const ucs2 = ucs2CMap.lookup(cid); + if (ucs2) { + toUnicode[charcode] = String.fromCharCode( + (ucs2.charCodeAt(0) << 8) + ucs2.charCodeAt(1) + ); + } + }); + return new ToUnicodeMap(toUnicode); } // The viewer's choice, just use an identity map. - return Promise.resolve( - new IdentityToUnicodeMap(properties.firstChar, properties.lastChar) - ); + return new IdentityToUnicodeMap(properties.firstChar, properties.lastChar); } readToUnicode(cmapObj) { From 7190bc23a856b12b06e91d78eb4564e9a0a9a775 Mon Sep 17 00:00:00 2001 From: Jonas Jenwald Date: Tue, 18 May 2021 09:04:14 +0200 Subject: [PATCH 3/4] Remove unnecessary `in` checks of Arrays, when building the `charCodeToGlyphId` for TrueType fonts Note that all standard Encodings have the same length (i.e. `256` elements) and that missing entries are always represented by empty strings, hence why a separate exists-check isn't necessary in the `baseEncoding` case. 
--- src/core/fonts.js | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/core/fonts.js b/src/core/fonts.js index b391ab693..6fb746197 100644 --- a/src/core/fonts.js +++ b/src/core/fonts.js @@ -2550,12 +2550,9 @@ class Font { const glyphsUnicodeMap = getGlyphsUnicode(); for (let charCode = 0; charCode < 256; charCode++) { let glyphName; - if (this.differences && charCode in this.differences) { + if (this.differences[charCode] !== undefined) { glyphName = this.differences[charCode]; - } else if ( - charCode in baseEncoding && - baseEncoding[charCode] !== "" - ) { + } else if (baseEncoding[charCode] !== "") { glyphName = baseEncoding[charCode]; } else { glyphName = StandardEncoding[charCode]; From 229a49b9b902768ed19cdafad18ad609b341d15f Mon Sep 17 00:00:00 2001 From: Jonas Jenwald Date: Tue, 18 May 2021 13:45:19 +0200 Subject: [PATCH 4/4] Re-factor the `fallbackToUnicode` functionality (PR 9192 follow-up) Rather than having to create and check a *separate* `ToUnicodeMap` to handle these cases, we can simply use the `fallbackToUnicode`-data (when it exists) to directly supplement *missing* /ToUnicode entries in the regular `ToUnicodeMap` instead. 
--- src/core/evaluator.js | 13 ++++++------- src/core/fonts.js | 35 ++++++++++++++++++++++++++++------- 2 files changed, 34 insertions(+), 14 deletions(-) diff --git a/src/core/evaluator.js b/src/core/evaluator.js index 4ef6f6e57..2eb9847c9 100644 --- a/src/core/evaluator.js +++ b/src/core/evaluator.js @@ -3178,10 +3178,10 @@ class PartialEvaluator { } /** - * @returns {ToUnicodeMap} + * @returns {Array} * @private */ - _buildSimpleFontToUnicode(properties, forceGlyphs = false) { + _simpleFontToUnicode(properties, forceGlyphs = false) { assert(!properties.composite, "Must be a simple font."); const toUnicode = []; @@ -3242,7 +3242,7 @@ class PartialEvaluator { Number.isNaN(code) && Number.isInteger(parseInt(codeStr, 16)) ) { - return this._buildSimpleFontToUnicode( + return this._simpleFontToUnicode( properties, /* forceGlyphs */ true ); @@ -3275,7 +3275,7 @@ class PartialEvaluator { } toUnicode[charcode] = String.fromCharCode(glyphsUnicodeMap[glyphName]); } - return new ToUnicodeMap(toUnicode); + return toUnicode; } /** @@ -3294,8 +3294,7 @@ class PartialEvaluator { // text-extraction. For simple fonts, containing encoding information, // use a fallback ToUnicode map to improve this (fixes issue8229.pdf). if (!properties.composite && properties.hasEncoding) { - properties.fallbackToUnicode = - this._buildSimpleFontToUnicode(properties); + properties.fallbackToUnicode = this._simpleFontToUnicode(properties); } return properties.toUnicode; } @@ -3306,7 +3305,7 @@ class PartialEvaluator { // in pratice it seems better to always try to create a toUnicode map // based of the default encoding. 
if (!properties.composite /* is simple font */) { - return this._buildSimpleFontToUnicode(properties); + return new ToUnicodeMap(this._simpleFontToUnicode(properties)); } // If the font is a composite font that uses one of the predefined CMaps diff --git a/src/core/fonts.js b/src/core/fonts.js index 6fb746197..370035ffe 100644 --- a/src/core/fonts.js +++ b/src/core/fonts.js @@ -167,6 +167,29 @@ function adjustToUnicode(properties, builtInEncoding) { } } +/** + * NOTE: This function should only be called at the *end* of font-parsing, + * after e.g. `adjustToUnicode` has run, to prevent any issues. + */ +function amendFallbackToUnicode(properties) { + if (!properties.fallbackToUnicode) { + return; + } + if (properties.toUnicode instanceof IdentityToUnicodeMap) { + return; + } + const toUnicode = []; + for (const charCode in properties.fallbackToUnicode) { + if (properties.toUnicode.has(charCode)) { + continue; // The font dictionary has a `ToUnicode` entry. + } + toUnicode[charCode] = properties.fallbackToUnicode[charCode]; + } + if (toUnicode.length > 0) { + properties.toUnicode.amend(toUnicode); + } +} + class Glyph { constructor( originalCharCode, @@ -854,8 +877,6 @@ class Font { this.defaultEncoding = properties.defaultEncoding; this.toUnicode = properties.toUnicode; - this.fallbackToUnicode = properties.fallbackToUnicode || new ToUnicodeMap(); - this.toFontChar = []; if (properties.type === "Type3") { @@ -941,6 +962,7 @@ class Font { return; } + amendFallbackToUnicode(properties); this.data = data; this.fontType = getFontType(type, subtype, properties.isStandardFont); @@ -1099,6 +1121,8 @@ class Font { } this.toFontChar = map; } + + amendFallbackToUnicode(properties); this.loadedName = fontName.split("-")[0]; this.fontType = getFontType(type, subtype, properties.isStandardFont); } @@ -2957,15 +2981,12 @@ class Font { width = isNum(width) ? 
width : this.defaultWidth; const vmetric = this.vmetrics && this.vmetrics[widthCode]; - let unicode = - this.toUnicode.get(charcode) || - this.fallbackToUnicode.get(charcode) || - charcode; + let unicode = this.toUnicode.get(charcode) || charcode; if (typeof unicode === "number") { unicode = String.fromCharCode(unicode); } - let isInFont = charcode in this.toFontChar; + let isInFont = this.toFontChar[charcode] !== undefined; // First try the toFontChar map, if it's not there then try falling // back to the char code. fontCharCode = this.toFontChar[charcode] || charcode;