Merge pull request #13393 from Snuffleupagus/adjustToUnicode-hasIncludedToUnicodeMap

Tweak `adjustToUnicode` to allow extending a built-in /ToUnicode map
This commit is contained in:
Jonas Jenwald 2021-06-16 17:06:17 +02:00 committed by GitHub
commit 7fa61c062c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 71 additions and 55 deletions

View File

@ -3178,10 +3178,10 @@ class PartialEvaluator {
} }
/** /**
* @returns {ToUnicodeMap} * @returns {Array}
* @private * @private
*/ */
_buildSimpleFontToUnicode(properties, forceGlyphs = false) { _simpleFontToUnicode(properties, forceGlyphs = false) {
assert(!properties.composite, "Must be a simple font."); assert(!properties.composite, "Must be a simple font.");
const toUnicode = []; const toUnicode = [];
@ -3242,7 +3242,7 @@ class PartialEvaluator {
Number.isNaN(code) && Number.isNaN(code) &&
Number.isInteger(parseInt(codeStr, 16)) Number.isInteger(parseInt(codeStr, 16))
) { ) {
return this._buildSimpleFontToUnicode( return this._simpleFontToUnicode(
properties, properties,
/* forceGlyphs */ true /* forceGlyphs */ true
); );
@ -3275,7 +3275,7 @@ class PartialEvaluator {
} }
toUnicode[charcode] = String.fromCharCode(glyphsUnicodeMap[glyphName]); toUnicode[charcode] = String.fromCharCode(glyphsUnicodeMap[glyphName]);
} }
return new ToUnicodeMap(toUnicode); return toUnicode;
} }
/** /**
@ -3284,7 +3284,7 @@ class PartialEvaluator {
* @returns {Promise} A Promise that is resolved with a * @returns {Promise} A Promise that is resolved with a
* {ToUnicodeMap|IdentityToUnicodeMap} object. * {ToUnicodeMap|IdentityToUnicodeMap} object.
*/ */
buildToUnicode(properties) { async buildToUnicode(properties) {
properties.hasIncludedToUnicodeMap = properties.hasIncludedToUnicodeMap =
!!properties.toUnicode && properties.toUnicode.length > 0; !!properties.toUnicode && properties.toUnicode.length > 0;
@ -3294,11 +3294,9 @@ class PartialEvaluator {
// text-extraction. For simple fonts, containing encoding information, // text-extraction. For simple fonts, containing encoding information,
// use a fallback ToUnicode map to improve this (fixes issue8229.pdf). // use a fallback ToUnicode map to improve this (fixes issue8229.pdf).
if (!properties.composite && properties.hasEncoding) { if (!properties.composite && properties.hasEncoding) {
properties.fallbackToUnicode = properties.fallbackToUnicode = this._simpleFontToUnicode(properties);
this._buildSimpleFontToUnicode(properties);
} }
return properties.toUnicode;
return Promise.resolve(properties.toUnicode);
} }
// According to the spec if the font is a simple font we should only map // According to the spec if the font is a simple font we should only map
@ -3307,7 +3305,7 @@ class PartialEvaluator {
// in practice it seems better to always try to create a toUnicode map // in practice it seems better to always try to create a toUnicode map
// based on the default encoding. // based on the default encoding.
if (!properties.composite /* is simple font */) { if (!properties.composite /* is simple font */) {
return Promise.resolve(this._buildSimpleFontToUnicode(properties)); return new ToUnicodeMap(this._simpleFontToUnicode(properties));
} }
// If the font is a composite font that uses one of the predefined CMaps // If the font is a composite font that uses one of the predefined CMaps
@ -3330,22 +3328,20 @@ class PartialEvaluator {
// b) Obtain the registry and ordering of the character collection used // b) Obtain the registry and ordering of the character collection used
// by the fonts CMap (for example, Adobe and Japan1) from its // by the fonts CMap (for example, Adobe and Japan1) from its
// CIDSystemInfo dictionary. // CIDSystemInfo dictionary.
const registry = properties.cidSystemInfo.registry; const { registry, ordering } = properties.cidSystemInfo;
const ordering = properties.cidSystemInfo.ordering;
// c) Construct a second CMap name by concatenating the registry and // c) Construct a second CMap name by concatenating the registry and
// ordering obtained in step (b) in the format registry-ordering-UCS2 // ordering obtained in step (b) in the format registry-ordering-UCS2
// (for example, Adobe-Japan1-UCS2). // (for example, Adobe-Japan1-UCS2).
const ucs2CMapName = Name.get(registry + "-" + ordering + "-UCS2"); const ucs2CMapName = Name.get(`${registry}-${ordering}-UCS2`);
// d) Obtain the CMap with the name constructed in step (c) (available // d) Obtain the CMap with the name constructed in step (c) (available
// from the ASN Web site; see the Bibliography). // from the ASN Web site; see the Bibliography).
return CMapFactory.create({ const ucs2CMap = await CMapFactory.create({
encoding: ucs2CMapName, encoding: ucs2CMapName,
fetchBuiltInCMap: this._fetchBuiltInCMapBound, fetchBuiltInCMap: this._fetchBuiltInCMapBound,
useCMap: null, useCMap: null,
}).then(function (ucs2CMap) { });
const cMap = properties.cMap;
const toUnicode = []; const toUnicode = [];
cMap.forEach(function (charcode, cid) { properties.cMap.forEach(function (charcode, cid) {
if (cid > 0xffff) { if (cid > 0xffff) {
throw new FormatError("Max size of CID is 65,535"); throw new FormatError("Max size of CID is 65,535");
} }
@ -3359,13 +3355,10 @@ class PartialEvaluator {
} }
}); });
return new ToUnicodeMap(toUnicode); return new ToUnicodeMap(toUnicode);
});
} }
// The viewer's choice, just use an identity map. // The viewer's choice, just use an identity map.
return Promise.resolve( return new IdentityToUnicodeMap(properties.firstChar, properties.lastChar);
new IdentityToUnicodeMap(properties.firstChar, properties.lastChar)
);
} }
readToUnicode(cmapObj) { readToUnicode(cmapObj) {

View File

@ -135,9 +135,6 @@ function adjustToUnicode(properties, builtInEncoding) {
if (properties.isInternalFont) { if (properties.isInternalFont) {
return; return;
} }
if (properties.hasIncludedToUnicodeMap) {
return; // The font dictionary has a `ToUnicode` entry.
}
if (builtInEncoding === properties.defaultEncoding) { if (builtInEncoding === properties.defaultEncoding) {
return; // No point in trying to adjust `toUnicode` if the encodings match. return; // No point in trying to adjust `toUnicode` if the encodings match.
} }
@ -147,20 +144,51 @@ function adjustToUnicode(properties, builtInEncoding) {
const toUnicode = [], const toUnicode = [],
glyphsUnicodeMap = getGlyphsUnicode(); glyphsUnicodeMap = getGlyphsUnicode();
for (const charCode in builtInEncoding) { for (const charCode in builtInEncoding) {
if (properties.hasIncludedToUnicodeMap) {
if (properties.toUnicode.has(charCode)) {
continue; // The font dictionary has a `ToUnicode` entry.
}
} else {
if ( if (
properties.hasEncoding && properties.hasEncoding &&
properties.differences[charCode] !== undefined properties.differences[charCode] !== undefined
) { ) {
continue; // The font dictionary has an `Encoding`/`Differences` entry. continue; // The font dictionary has an `Encoding`/`Differences` entry.
} }
}
const glyphName = builtInEncoding[charCode]; const glyphName = builtInEncoding[charCode];
const unicode = getUnicodeForGlyph(glyphName, glyphsUnicodeMap); const unicode = getUnicodeForGlyph(glyphName, glyphsUnicodeMap);
if (unicode !== -1) { if (unicode !== -1) {
toUnicode[charCode] = String.fromCharCode(unicode); toUnicode[charCode] = String.fromCharCode(unicode);
} }
} }
if (toUnicode.length > 0) {
properties.toUnicode.amend(toUnicode); properties.toUnicode.amend(toUnicode);
} }
}
/**
 * Fills gaps in `properties.toUnicode` using `properties.fallbackToUnicode`.
 *
 * Every key of the fallback for which `properties.toUnicode.has(key)` is
 * false is collected into a temporary array, which is then passed to
 * `properties.toUnicode.amend`. Entries that `toUnicode` already reports
 * as present are never overridden.
 *
 * Nothing happens when there is no fallback, or when `properties.toUnicode`
 * is an `IdentityToUnicodeMap`.
 *
 * NOTE: This function should only be called at the *end* of font-parsing,
 *       after e.g. `adjustToUnicode` has run, to prevent any issues.
 *
 * @param {Object} properties - Font properties; `fallbackToUnicode` and
 *   `toUnicode` are read, and `toUnicode` may be amended in place.
 */
function amendFallbackToUnicode(properties) {
  const fallback = properties.fallbackToUnicode;
  if (!fallback) {
    return;
  }
  const target = properties.toUnicode;
  if (target instanceof IdentityToUnicodeMap) {
    return;
  }

  const missing = [];
  // `for...in` visits only the populated slots of the (sparse) fallback.
  for (const code in fallback) {
    if (!target.has(code)) {
      // The font dictionary has no `ToUnicode` entry for this code.
      missing[code] = fallback[code];
    }
  }
  if (missing.length !== 0) {
    target.amend(missing);
  }
}
class Glyph { class Glyph {
constructor( constructor(
@ -849,8 +877,6 @@ class Font {
this.defaultEncoding = properties.defaultEncoding; this.defaultEncoding = properties.defaultEncoding;
this.toUnicode = properties.toUnicode; this.toUnicode = properties.toUnicode;
this.fallbackToUnicode = properties.fallbackToUnicode || new ToUnicodeMap();
this.toFontChar = []; this.toFontChar = [];
if (properties.type === "Type3") { if (properties.type === "Type3") {
@ -936,6 +962,7 @@ class Font {
return; return;
} }
amendFallbackToUnicode(properties);
this.data = data; this.data = data;
this.fontType = getFontType(type, subtype, properties.isStandardFont); this.fontType = getFontType(type, subtype, properties.isStandardFont);
@ -1094,6 +1121,8 @@ class Font {
} }
this.toFontChar = map; this.toFontChar = map;
} }
amendFallbackToUnicode(properties);
this.loadedName = fontName.split("-")[0]; this.loadedName = fontName.split("-")[0];
this.fontType = getFontType(type, subtype, properties.isStandardFont); this.fontType = getFontType(type, subtype, properties.isStandardFont);
} }
@ -2545,12 +2574,9 @@ class Font {
const glyphsUnicodeMap = getGlyphsUnicode(); const glyphsUnicodeMap = getGlyphsUnicode();
for (let charCode = 0; charCode < 256; charCode++) { for (let charCode = 0; charCode < 256; charCode++) {
let glyphName; let glyphName;
if (this.differences && charCode in this.differences) { if (this.differences[charCode] !== undefined) {
glyphName = this.differences[charCode]; glyphName = this.differences[charCode];
} else if ( } else if (baseEncoding[charCode] !== "") {
charCode in baseEncoding &&
baseEncoding[charCode] !== ""
) {
glyphName = baseEncoding[charCode]; glyphName = baseEncoding[charCode];
} else { } else {
glyphName = StandardEncoding[charCode]; glyphName = StandardEncoding[charCode];
@ -2955,15 +2981,12 @@ class Font {
width = isNum(width) ? width : this.defaultWidth; width = isNum(width) ? width : this.defaultWidth;
const vmetric = this.vmetrics && this.vmetrics[widthCode]; const vmetric = this.vmetrics && this.vmetrics[widthCode];
let unicode = let unicode = this.toUnicode.get(charcode) || charcode;
this.toUnicode.get(charcode) ||
this.fallbackToUnicode.get(charcode) ||
charcode;
if (typeof unicode === "number") { if (typeof unicode === "number") {
unicode = String.fromCharCode(unicode); unicode = String.fromCharCode(unicode);
} }
let isInFont = charcode in this.toFontChar; let isInFont = this.toFontChar[charcode] !== undefined;
// First try the toFontChar map, if it's not there then try falling // First try the toFontChar map, if it's not there then try falling
// back to the char code. // back to the char code.
fontCharCode = this.toFontChar[charcode] || charcode; fontCharCode = this.toFontChar[charcode] || charcode;