From 229a49b9b902768ed19cdafad18ad609b341d15f Mon Sep 17 00:00:00 2001 From: Jonas Jenwald Date: Tue, 18 May 2021 13:45:19 +0200 Subject: [PATCH] Re-factor the `fallbackToUnicode` functionality (PR 9192 follow-up) Rather than having to create and check a *separate* `ToUnicodeMap` to handle these cases, we can simply use the `fallbackToUnicode`-data (when it exists) to directly supplement *missing* /ToUnicode entries in the regular `ToUnicodeMap` instead. --- src/core/evaluator.js | 13 ++++++------- src/core/fonts.js | 35 ++++++++++++++++++++++++++++------- 2 files changed, 34 insertions(+), 14 deletions(-) diff --git a/src/core/evaluator.js b/src/core/evaluator.js index 4ef6f6e57..2eb9847c9 100644 --- a/src/core/evaluator.js +++ b/src/core/evaluator.js @@ -3178,10 +3178,10 @@ class PartialEvaluator { } /** - * @returns {ToUnicodeMap} + * @returns {Array} * @private */ - _buildSimpleFontToUnicode(properties, forceGlyphs = false) { + _simpleFontToUnicode(properties, forceGlyphs = false) { assert(!properties.composite, "Must be a simple font."); const toUnicode = []; @@ -3242,7 +3242,7 @@ class PartialEvaluator { Number.isNaN(code) && Number.isInteger(parseInt(codeStr, 16)) ) { - return this._buildSimpleFontToUnicode( + return this._simpleFontToUnicode( properties, /* forceGlyphs */ true ); @@ -3275,7 +3275,7 @@ class PartialEvaluator { } toUnicode[charcode] = String.fromCharCode(glyphsUnicodeMap[glyphName]); } - return new ToUnicodeMap(toUnicode); + return toUnicode; } /** @@ -3294,8 +3294,7 @@ class PartialEvaluator { // text-extraction. For simple fonts, containing encoding information, // use a fallback ToUnicode map to improve this (fixes issue8229.pdf). 
if (!properties.composite && properties.hasEncoding) { - properties.fallbackToUnicode = - this._buildSimpleFontToUnicode(properties); + properties.fallbackToUnicode = this._simpleFontToUnicode(properties); } return properties.toUnicode; } @@ -3306,7 +3305,7 @@ class PartialEvaluator { // in pratice it seems better to always try to create a toUnicode map // based of the default encoding. if (!properties.composite /* is simple font */) { - return this._buildSimpleFontToUnicode(properties); + return new ToUnicodeMap(this._simpleFontToUnicode(properties)); } // If the font is a composite font that uses one of the predefined CMaps diff --git a/src/core/fonts.js b/src/core/fonts.js index 6fb746197..370035ffe 100644 --- a/src/core/fonts.js +++ b/src/core/fonts.js @@ -167,6 +167,29 @@ function adjustToUnicode(properties, builtInEncoding) { } } +/** + * NOTE: This function should only be called at the *end* of font-parsing, + * after e.g. `adjustToUnicode` has run, to prevent any issues. + */ +function amendFallbackToUnicode(properties) { + if (!properties.fallbackToUnicode) { + return; + } + if (properties.toUnicode instanceof IdentityToUnicodeMap) { + return; + } + const toUnicode = []; + for (const charCode in properties.fallbackToUnicode) { + if (properties.toUnicode.has(charCode)) { + continue; // The font dictionary has a `ToUnicode` entry. 
+ } + toUnicode[charCode] = properties.fallbackToUnicode[charCode]; + } + if (toUnicode.length > 0) { + properties.toUnicode.amend(toUnicode); + } +} + class Glyph { constructor( originalCharCode, @@ -854,8 +877,6 @@ class Font { this.defaultEncoding = properties.defaultEncoding; this.toUnicode = properties.toUnicode; - this.fallbackToUnicode = properties.fallbackToUnicode || new ToUnicodeMap(); - this.toFontChar = []; if (properties.type === "Type3") { @@ -941,6 +962,7 @@ class Font { return; } + amendFallbackToUnicode(properties); this.data = data; this.fontType = getFontType(type, subtype, properties.isStandardFont); @@ -1099,6 +1121,8 @@ class Font { } this.toFontChar = map; } + + amendFallbackToUnicode(properties); this.loadedName = fontName.split("-")[0]; this.fontType = getFontType(type, subtype, properties.isStandardFont); } @@ -2957,15 +2981,12 @@ class Font { width = isNum(width) ? width : this.defaultWidth; const vmetric = this.vmetrics && this.vmetrics[widthCode]; - let unicode = - this.toUnicode.get(charcode) || - this.fallbackToUnicode.get(charcode) || - charcode; + let unicode = this.toUnicode.get(charcode) || charcode; if (typeof unicode === "number") { unicode = String.fromCharCode(unicode); } - let isInFont = charcode in this.toFontChar; + let isInFont = this.toFontChar[charcode] !== undefined; // First try the toFontChar map, if it's not there then try falling // back to the char code. fontCharCode = this.toFontChar[charcode] || charcode;