Improve the heuristics, in PartialEvaluator._buildSimpleFontToUnicode, for glyphNames of the Cdd{d}/cdd{d} format (issue 9655)

*Please note:* I've been thinking about possible ways of addressing this issue for a while now, but all of the solutions I came up with became too complicated and thus hurt the readability of the code. However, it occurred to me that we're essentially trying to add a heuristic *on top* of another heuristic, and that it shouldn't matter how efficient the code is as long as it works. In the PDF file from the issue, the Encoding contains glyphNames of the `Cdd` format, which our existing heuristics treat as base 10 values. In this particular file, however, they actually contain base 16 values; we now attempt to detect that case and re-parse the entire encoding as base 16, such that text-selection works.
2019-09-29 23:50:58 +02:00 · 2019-09-29 23:50:58 +02:00 · f5be2d62a3
commit f5be2d62a3
parent 0786363b7c
4 changed files with 32 additions and 7 deletions
--- a/src/core/evaluator.js
+++ b/src/core/evaluator.js
@ -1977,7 +1977,7 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
     * @returns {ToUnicodeMap}
     * @private
     */
-    _buildSimpleFontToUnicode(properties) {
+    _buildSimpleFontToUnicode(properties, forceGlyphs = false) {
      assert(!properties.composite, 'Must be a simple font.');

      let toUnicode = [], charcode, glyphName;
@ -2017,14 +2017,31 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
                code = parseInt(glyphName.substring(1), 16);
              }
              break;
-            case 'C': // Cddd glyph
-            case 'c': // cddd glyph
-              if (glyphName.length >= 3) {
-                code = +glyphName.substring(1);
+            case 'C': // Cdd{d} glyph
+            case 'c': // cdd{d} glyph
+              if (glyphName.length >= 3 && glyphName.length <= 4) {
+                const codeStr = glyphName.substring(1);
+
+                if (forceGlyphs) {
+                  code = parseInt(codeStr, 16);
+                  break;
+                }
+                // Normally the Cdd{d}/cdd{d} glyphName format will contain
+                // regular, i.e. base 10, charCodes (see issue4550.pdf)...
+                code = +codeStr;
+
+                // ... however some PDF generators violate that assumption by
+                // containing glyph, i.e. base 16, codes instead.
+                // In that case we need to re-parse the *entire* encoding to
+                // prevent broken text-selection (fixes issue9655_reduced.pdf).
+                if (Number.isNaN(code) &&
+                    Number.isInteger(parseInt(codeStr, 16))) {
+                  return this._buildSimpleFontToUnicode(properties,
+                                                        /* forceGlyphs */ true);
+                }
              }
              break;
-            default:
-              // 'uniXXXX'/'uXXXX{XX}' glyphs
+            default: // 'uniXXXX'/'uXXXX{XX}' glyphs
              let unicode = getUnicodeForGlyph(glyphName, glyphsUnicodeMap);
              if (unicode !== -1) {
                code = unicode;
--- a/test/pdfs/.gitignore
+++ b/test/pdfs/.gitignore
@ -73,6 +73,7 @@
 !issue9291.pdf
 !issue9418.pdf
 !issue9458.pdf
+!issue9655_reduced.pdf
 !issue9915_reduced.pdf
 !issue9940.pdf
 !issue10388_reduced.pdf
--- a/test/pdfs/issue9655_reduced.pdf
+++ b/test/pdfs/issue9655_reduced.pdf
--- a/test/test_manifest.json
+++ b/test/test_manifest.json
@ -495,6 +495,13 @@
       "rounds": 1,
       "type": "text"
    },
+    {  "id": "issue9655-text",
+       "file": "pdfs/issue9655_reduced.pdf",
+       "md5": "87259a82cf3cda18e240517ca53c312a",
+       "rounds": 1,
+       "link": false,
+       "type": "text"
+    },
    {  "id": "jai-pdf",
       "file": "pdfs/jai.pdf",
       "md5": "1f5dd128c3757420a881a155f2f8ace3",