diff --git a/src/core/evaluator.js b/src/core/evaluator.js index 1883db8de..b78b3bc13 100644 --- a/src/core/evaluator.js +++ b/src/core/evaluator.js @@ -2021,6 +2021,14 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { // Section 9.10.2 Mapping Character Codes to Unicode Values if (properties.hasIncludedToUnicodeMap) { + // Some fonts contain incomplete ToUnicode data, causing issues with + // text-extraction. For simple fonts, containing encoding information, + // use a fallback ToUnicode map to improve this (fixes issue8229.pdf). + if (!properties.composite && properties.hasEncoding) { + properties.fallbackToUnicode = + this._buildSimpleFontToUnicode(properties); + } + return Promise.resolve(properties.toUnicode); } diff --git a/src/core/fonts.js b/src/core/fonts.js index 475be25a3..2afa54d1d 100644 --- a/src/core/fonts.js +++ b/src/core/fonts.js @@ -211,9 +211,9 @@ var Glyph = (function GlyphClosure() { })(); var ToUnicodeMap = (function ToUnicodeMapClosure() { - function ToUnicodeMap(cmap) { + function ToUnicodeMap(cmap = []) { // The elements of this._map can be integers or strings, depending on how - // |cmap| was created. + // `cmap` was created. this._map = cmap; } @@ -516,6 +516,7 @@ var Font = (function FontClosure() { this.defaultEncoding = properties.defaultEncoding; this.toUnicode = properties.toUnicode; + this.fallbackToUnicode = properties.fallbackToUnicode || new ToUnicodeMap(); this.toFontChar = []; @@ -2766,7 +2767,8 @@ var Font = (function FontClosure() { width = isNum(width) ? width : this.defaultWidth; var vmetric = this.vmetrics && this.vmetrics[widthCode]; - var unicode = this.toUnicode.get(charcode) || charcode; + let unicode = this.toUnicode.get(charcode) || + this.fallbackToUnicode.get(charcode) || charcode; if (typeof unicode === 'number') { unicode = String.fromCharCode(unicode); } diff --git a/test/pdfs/.gitignore b/test/pdfs/.gitignore index bed1b9c37..8fd4fc0d6 100644 --- a/test/pdfs/.gitignore +++ b/test/pdfs/.gitignore @@ -53,6 +53,7 @@ !issue8061.pdf !issue8088.pdf !issue8125.pdf +!issue8229.pdf !issue8372.pdf !issue8424.pdf !issue8480.pdf diff --git a/test/pdfs/issue8229.pdf b/test/pdfs/issue8229.pdf new file mode 100644 index 000000000..e50487b12 Binary files /dev/null and b/test/pdfs/issue8229.pdf differ diff --git a/test/test_manifest.json b/test/test_manifest.json index 63abe91e8..ee7c626ff 100644 --- a/test/test_manifest.json +++ b/test/test_manifest.json @@ -1438,6 +1438,13 @@ "link": false, "type": "text" }, + { "id": "issue8229", + "file": "pdfs/issue8229.pdf", + "md5": "a729f663782e87ebc1efad0755ebf6a5", + "rounds": 1, + "link": false, + "type": "text" + }, { "id": "ShowText-ShadingPattern", "file": "pdfs/ShowText-ShadingPattern.pdf", "md5": "fe683725db037ffe19d390969610a652",