Improve the heuristics, in PartialEvaluator._buildSimpleFontToUnicode, for glyphNames of the Cdd{d}/cdd{d} format (issue 9655)
*Please note:* I've been thinking about possible ways of addressing this issue for a while now, but all of the solutions I came up with became too complicated and thus hurt the readability of the code. However, it occurred to me that we're essentially trying to add a heuristic *on top* of another heuristic, and that it shouldn't matter how efficient the code is as long as it works. In the PDF file in the issue, the Encoding contains glyphNames of the `Cdd` format, which our existing heuristics will treat as base 10 values. However, in this particular file they actually contain base 16 values, which we thus attempt to detect and fix such that text-selection works.
This commit is contained in:
parent
0786363b7c
commit
f5be2d62a3
@ -1977,7 +1977,7 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
|
||||
* @returns {ToUnicodeMap}
|
||||
* @private
|
||||
*/
|
||||
_buildSimpleFontToUnicode(properties) {
|
||||
_buildSimpleFontToUnicode(properties, forceGlyphs = false) {
|
||||
assert(!properties.composite, 'Must be a simple font.');
|
||||
|
||||
let toUnicode = [], charcode, glyphName;
|
||||
@ -2017,14 +2017,31 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
|
||||
code = parseInt(glyphName.substring(1), 16);
|
||||
}
|
||||
break;
|
||||
case 'C': // Cddd glyph
|
||||
case 'c': // cddd glyph
|
||||
if (glyphName.length >= 3) {
|
||||
code = +glyphName.substring(1);
|
||||
case 'C': // Cdd{d} glyph
|
||||
case 'c': // cdd{d} glyph
|
||||
if (glyphName.length >= 3 && glyphName.length <= 4) {
|
||||
const codeStr = glyphName.substring(1);
|
||||
|
||||
if (forceGlyphs) {
|
||||
code = parseInt(codeStr, 16);
|
||||
break;
|
||||
}
|
||||
// Normally the Cdd{d}/cdd{d} glyphName format will contain
|
||||
// regular, i.e. base 10, charCodes (see issue4550.pdf)...
|
||||
code = +codeStr;
|
||||
|
||||
// ... however some PDF generators violate that assumption by
|
||||
// containing glyph, i.e. base 16, codes instead.
|
||||
// In that case we need to re-parse the *entire* encoding to
|
||||
// prevent broken text-selection (fixes issue9655_reduced.pdf).
|
||||
if (Number.isNaN(code) &&
|
||||
Number.isInteger(parseInt(codeStr, 16))) {
|
||||
return this._buildSimpleFontToUnicode(properties,
|
||||
/* forceGlyphs */ true);
|
||||
}
|
||||
}
|
||||
break;
|
||||
default:
|
||||
// 'uniXXXX'/'uXXXX{XX}' glyphs
|
||||
default: // 'uniXXXX'/'uXXXX{XX}' glyphs
|
||||
let unicode = getUnicodeForGlyph(glyphName, glyphsUnicodeMap);
|
||||
if (unicode !== -1) {
|
||||
code = unicode;
|
||||
|
1
test/pdfs/.gitignore
vendored
1
test/pdfs/.gitignore
vendored
@ -73,6 +73,7 @@
|
||||
!issue9291.pdf
|
||||
!issue9418.pdf
|
||||
!issue9458.pdf
|
||||
!issue9655_reduced.pdf
|
||||
!issue9915_reduced.pdf
|
||||
!issue9940.pdf
|
||||
!issue10388_reduced.pdf
|
||||
|
BIN
test/pdfs/issue9655_reduced.pdf
Normal file
BIN
test/pdfs/issue9655_reduced.pdf
Normal file
Binary file not shown.
@ -495,6 +495,13 @@
|
||||
"rounds": 1,
|
||||
"type": "text"
|
||||
},
|
||||
{ "id": "issue9655-text",
|
||||
"file": "pdfs/issue9655_reduced.pdf",
|
||||
"md5": "87259a82cf3cda18e240517ca53c312a",
|
||||
"rounds": 1,
|
||||
"link": false,
|
||||
"type": "text"
|
||||
},
|
||||
{ "id": "jai-pdf",
|
||||
"file": "pdfs/jai.pdf",
|
||||
"md5": "1f5dd128c3757420a881a155f2f8ace3",
|
||||
|
Loading…
Reference in New Issue
Block a user