diff --git a/src/core/evaluator.js b/src/core/evaluator.js index 2c3c80964..9bc1e75ef 100644 --- a/src/core/evaluator.js +++ b/src/core/evaluator.js @@ -1977,7 +1977,7 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { * @returns {ToUnicodeMap} * @private */ - _buildSimpleFontToUnicode(properties) { + _buildSimpleFontToUnicode(properties, forceGlyphs = false) { assert(!properties.composite, 'Must be a simple font.'); let toUnicode = [], charcode, glyphName; @@ -2017,14 +2017,31 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { code = parseInt(glyphName.substring(1), 16); } break; - case 'C': // Cddd glyph - case 'c': // cddd glyph - if (glyphName.length >= 3) { - code = +glyphName.substring(1); + case 'C': // Cdd{d} glyph + case 'c': // cdd{d} glyph + if (glyphName.length >= 3 && glyphName.length <= 4) { + const codeStr = glyphName.substring(1); + + if (forceGlyphs) { + code = parseInt(codeStr, 16); + break; + } + // Normally the Cdd{d}/cdd{d} glyphName format will contain + // regular, i.e. base 10, charCodes (see issue4550.pdf)... + code = +codeStr; + + // ... however some PDF generators violate that assumption by + // containing glyph, i.e. base 16, codes instead. + // In that case we need to re-parse the *entire* encoding to + // prevent broken text-selection (fixes issue9655_reduced.pdf). + if (Number.isNaN(code) && + Number.isInteger(parseInt(codeStr, 16))) { + return this._buildSimpleFontToUnicode(properties, + /* forceGlyphs */ true); + } } break; - default: - // 'uniXXXX'/'uXXXX{XX}' glyphs + default: // 'uniXXXX'/'uXXXX{XX}' glyphs let unicode = getUnicodeForGlyph(glyphName, glyphsUnicodeMap); if (unicode !== -1) { code = unicode; diff --git a/test/pdfs/.gitignore b/test/pdfs/.gitignore index 46f0aa1a6..0ca334476 100644 --- a/test/pdfs/.gitignore +++ b/test/pdfs/.gitignore @@ -73,6 +73,7 @@ !issue9291.pdf !issue9418.pdf !issue9458.pdf +!issue9655_reduced.pdf !issue9915_reduced.pdf !issue9940.pdf !issue10388_reduced.pdf diff --git a/test/pdfs/issue9655_reduced.pdf b/test/pdfs/issue9655_reduced.pdf new file mode 100644 index 000000000..535fc1acf Binary files /dev/null and b/test/pdfs/issue9655_reduced.pdf differ diff --git a/test/test_manifest.json b/test/test_manifest.json index 4d2e3d961..ada0a22c4 100644 --- a/test/test_manifest.json +++ b/test/test_manifest.json @@ -495,6 +495,13 @@ "rounds": 1, "type": "text" }, + { "id": "issue9655-text", + "file": "pdfs/issue9655_reduced.pdf", + "md5": "87259a82cf3cda18e240517ca53c312a", + "rounds": 1, + "link": false, + "type": "text" + }, { "id": "jai-pdf", "file": "pdfs/jai.pdf", "md5": "1f5dd128c3757420a881a155f2f8ace3",