Slightly re-factor PartialEvaluator._simpleFontToUnicode

Given the sheer number of heuristics added to this method over the years, moving the *valid* unicode found case to the top should improve readability of the code.
This commit is contained in:
Jonas Jenwald 2022-10-05 12:52:22 +02:00
parent c6cc7c6e6a
commit fa47d4b9b1

View File

@ -3526,77 +3526,77 @@ class PartialEvaluator {
for (const charcode in encoding) {
// a) Map the character code to a character name.
let glyphName = encoding[charcode];
// b) Look up the character name in the Adobe Glyph List (see the
// Bibliography) to obtain the corresponding Unicode value.
if (glyphName === "") {
continue;
} else if (glyphsUnicodeMap[glyphName] === undefined) {
// (undocumented) c) Few heuristics to recognize unknown glyphs
// NOTE: Adobe Reader does not do this step, but OSX Preview does
let code = 0;
switch (glyphName[0]) {
case "G": // Gxx glyph
if (glyphName.length === 3) {
code = parseInt(glyphName.substring(1), 16);
}
break;
case "g": // g00xx glyph
if (glyphName.length === 5) {
code = parseInt(glyphName.substring(1), 16);
}
break;
case "C": // Cdd{d} glyph
case "c": // cdd{d} glyph
if (glyphName.length >= 3 && glyphName.length <= 4) {
const codeStr = glyphName.substring(1);
if (forceGlyphs) {
code = parseInt(codeStr, 16);
break;
}
// Normally the Cdd{d}/cdd{d} glyphName format will contain
// regular, i.e. base 10, charCodes (see issue4550.pdf)...
code = +codeStr;
// ... however some PDF generators violate that assumption by
// containing glyph, i.e. base 16, codes instead.
// In that case we need to re-parse the *entire* encoding to
// prevent broken text-selection (fixes issue9655_reduced.pdf).
if (
Number.isNaN(code) &&
Number.isInteger(parseInt(codeStr, 16))
) {
return this._simpleFontToUnicode(
properties,
/* forceGlyphs */ true
);
}
}
break;
default: // 'uniXXXX'/'uXXXX{XX}' glyphs
const unicode = getUnicodeForGlyph(glyphName, glyphsUnicodeMap);
if (unicode !== -1) {
code = unicode;
}
}
if (code > 0 && code <= 0x10ffff && Number.isInteger(code)) {
// If `baseEncodingName` is one the predefined encodings, and `code`
// equals `charcode`, using the glyph defined in the baseEncoding
// seems to yield a better `toUnicode` mapping (fixes issue 5070).
if (baseEncodingName && code === +charcode) {
const baseEncoding = getEncoding(baseEncodingName);
if (baseEncoding && (glyphName = baseEncoding[charcode])) {
toUnicode[charcode] = String.fromCharCode(
glyphsUnicodeMap[glyphName]
);
continue;
}
}
toUnicode[charcode] = String.fromCodePoint(code);
}
}
// b) Look up the character name in the Adobe Glyph List (see the
// Bibliography) to obtain the corresponding Unicode value.
let unicode = glyphsUnicodeMap[glyphName];
if (unicode !== undefined) {
toUnicode[charcode] = String.fromCharCode(unicode);
continue;
}
toUnicode[charcode] = String.fromCharCode(glyphsUnicodeMap[glyphName]);
// (undocumented) c) Few heuristics to recognize unknown glyphs
// NOTE: Adobe Reader does not do this step, but OSX Preview does
let code = 0;
switch (glyphName[0]) {
case "G": // Gxx glyph
if (glyphName.length === 3) {
code = parseInt(glyphName.substring(1), 16);
}
break;
case "g": // g00xx glyph
if (glyphName.length === 5) {
code = parseInt(glyphName.substring(1), 16);
}
break;
case "C": // Cdd{d} glyph
case "c": // cdd{d} glyph
if (glyphName.length >= 3 && glyphName.length <= 4) {
const codeStr = glyphName.substring(1);
if (forceGlyphs) {
code = parseInt(codeStr, 16);
break;
}
// Normally the Cdd{d}/cdd{d} glyphName format will contain
// regular, i.e. base 10, charCodes (see issue4550.pdf)...
code = +codeStr;
// ... however some PDF generators violate that assumption by
// containing glyph, i.e. base 16, codes instead.
// In that case we need to re-parse the *entire* encoding to
// prevent broken text-selection (fixes issue9655_reduced.pdf).
if (Number.isNaN(code) && Number.isInteger(parseInt(codeStr, 16))) {
return this._simpleFontToUnicode(
properties,
/* forceGlyphs */ true
);
}
}
break;
case "u": // 'uniXXXX'/'uXXXX{XX}' glyphs
unicode = getUnicodeForGlyph(glyphName, glyphsUnicodeMap);
if (unicode !== -1) {
code = unicode;
}
break;
}
if (code > 0 && code <= 0x10ffff && Number.isInteger(code)) {
// If `baseEncodingName` is one the predefined encodings, and `code`
// equals `charcode`, using the glyph defined in the baseEncoding
// seems to yield a better `toUnicode` mapping (fixes issue 5070).
if (baseEncodingName && code === +charcode) {
const baseEncoding = getEncoding(baseEncodingName);
if (baseEncoding && (glyphName = baseEncoding[charcode])) {
toUnicode[charcode] = String.fromCharCode(
glyphsUnicodeMap[glyphName]
);
continue;
}
}
toUnicode[charcode] = String.fromCodePoint(code);
}
}
return toUnicode;
}