Improve the heuristics, in PartialEvaluator._buildSimpleFontToUnicode, for glyphNames of the Cdd{d}/cdd{d} format (issue 9655)

*Please note:* I've been thinking about possible ways of addressing this issue for a while now, but all of the solutions I came up with became too complicated and thus hurt readability of the code.
However, it occurred to me that we're essentially trying to add a heuristic *on top* of another heuristic, and that it shouldn't matter how efficient the code is as long as it works.

In the PDF file from the issue, the Encoding contains glyphNames of the `Cdd` format, which our existing heuristics treat as base 10 values. However, in this particular file they actually contain base 16 values, which we therefore attempt to detect and fix so that text-selection works.
This commit is contained in:
Jonas Jenwald 2019-09-29 23:50:58 +02:00
parent 0786363b7c
commit f5be2d62a3
4 changed files with 32 additions and 7 deletions

View File

@ -1977,7 +1977,7 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
* @returns {ToUnicodeMap} * @returns {ToUnicodeMap}
* @private * @private
*/ */
_buildSimpleFontToUnicode(properties) { _buildSimpleFontToUnicode(properties, forceGlyphs = false) {
assert(!properties.composite, 'Must be a simple font.'); assert(!properties.composite, 'Must be a simple font.');
let toUnicode = [], charcode, glyphName; let toUnicode = [], charcode, glyphName;
@ -2017,14 +2017,31 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
code = parseInt(glyphName.substring(1), 16); code = parseInt(glyphName.substring(1), 16);
} }
break; break;
case 'C': // Cddd glyph case 'C': // Cdd{d} glyph
case 'c': // cddd glyph case 'c': // cdd{d} glyph
if (glyphName.length >= 3) { if (glyphName.length >= 3 && glyphName.length <= 4) {
code = +glyphName.substring(1); const codeStr = glyphName.substring(1);
if (forceGlyphs) {
code = parseInt(codeStr, 16);
break;
}
// Normally the Cdd{d}/cdd{d} glyphName format will contain
// regular, i.e. base 10, charCodes (see issue4550.pdf)...
code = +codeStr;
// ... however some PDF generators violate that assumption by
// containing glyph, i.e. base 16, codes instead.
// In that case we need to re-parse the *entire* encoding to
// prevent broken text-selection (fixes issue9655_reduced.pdf).
if (Number.isNaN(code) &&
Number.isInteger(parseInt(codeStr, 16))) {
return this._buildSimpleFontToUnicode(properties,
/* forceGlyphs */ true);
}
} }
break; break;
default: default: // 'uniXXXX'/'uXXXX{XX}' glyphs
// 'uniXXXX'/'uXXXX{XX}' glyphs
let unicode = getUnicodeForGlyph(glyphName, glyphsUnicodeMap); let unicode = getUnicodeForGlyph(glyphName, glyphsUnicodeMap);
if (unicode !== -1) { if (unicode !== -1) {
code = unicode; code = unicode;

View File

@ -73,6 +73,7 @@
!issue9291.pdf !issue9291.pdf
!issue9418.pdf !issue9418.pdf
!issue9458.pdf !issue9458.pdf
!issue9655_reduced.pdf
!issue9915_reduced.pdf !issue9915_reduced.pdf
!issue9940.pdf !issue9940.pdf
!issue10388_reduced.pdf !issue10388_reduced.pdf

Binary file not shown.

View File

@ -495,6 +495,13 @@
"rounds": 1, "rounds": 1,
"type": "text" "type": "text"
}, },
{ "id": "issue9655-text",
"file": "pdfs/issue9655_reduced.pdf",
"md5": "87259a82cf3cda18e240517ca53c312a",
"rounds": 1,
"link": false,
"type": "text"
},
{ "id": "jai-pdf", { "id": "jai-pdf",
"file": "pdfs/jai.pdf", "file": "pdfs/jai.pdf",
"md5": "1f5dd128c3757420a881a155f2f8ace3", "md5": "1f5dd128c3757420a881a155f2f8ace3",