diff --git a/src/core/evaluator.js b/src/core/evaluator.js index f1ec82a96..8e389119f 100644 --- a/src/core/evaluator.js +++ b/src/core/evaluator.js @@ -3632,7 +3632,8 @@ class PartialEvaluator { fetchBuiltInCMap: this._fetchBuiltInCMapBound, useCMap: null, }); - const toUnicode = []; + const toUnicode = [], + buf = []; properties.cMap.forEach(function (charcode, cid) { if (cid > 0xffff) { throw new FormatError("Max size of CID is 65,535"); @@ -3641,9 +3642,12 @@ class PartialEvaluator { // obtained in step (d), producing a Unicode value. const ucs2 = ucs2CMap.lookup(cid); if (ucs2) { - toUnicode[charcode] = String.fromCharCode( - (ucs2.charCodeAt(0) << 8) + ucs2.charCodeAt(1) - ); + buf.length = 0; + // Support multi-byte entries (fixes issue16176.pdf). + for (let i = 0, ii = ucs2.length; i < ii; i += 2) { + buf.push((ucs2.charCodeAt(i) << 8) + ucs2.charCodeAt(i + 1)); + } + toUnicode[charcode] = String.fromCharCode(...buf); } }); return new ToUnicodeMap(toUnicode); diff --git a/test/pdfs/.gitignore b/test/pdfs/.gitignore index 55c567aee..679f1e97b 100644 --- a/test/pdfs/.gitignore +++ b/test/pdfs/.gitignore @@ -118,6 +118,7 @@ !issue6127.pdf !issue7891_bc0.pdf !issue11242_reduced.pdf +!issue16176.pdf !issue11279.pdf !issue11362.pdf !issue13325_reduced.pdf diff --git a/test/pdfs/issue16176.pdf b/test/pdfs/issue16176.pdf new file mode 100644 index 000000000..0558900e9 Binary files /dev/null and b/test/pdfs/issue16176.pdf differ diff --git a/test/unit/api_spec.js b/test/unit/api_spec.js index 293589e91..3bf5970ec 100644 --- a/test/unit/api_spec.js +++ b/test/unit/api_spec.js @@ -29,6 +29,7 @@ import { } from "../../src/shared/util.js"; import { buildGetDocumentParams, + CMAP_URL, DefaultFileReaderFactory, TEST_PDFS_PATH, } from "./test_utils.js"; @@ -2623,6 +2624,23 @@ Caron Broadcasting, Inc., an Ohio corporation (“Lessee”).`) await loadingTask.destroy(); }); + it("gets text content with multi-byte entries, using predefined CMaps (issue 16176)", async function () { + const loadingTask = getDocument( + buildGetDocumentParams("issue16176.pdf", { + cMapUrl: CMAP_URL, + useWorkerFetch: false, + }) + ); + const pdfDoc = await loadingTask.promise; + const pdfPage = await pdfDoc.getPage(1); + const { items } = await pdfPage.getTextContent(); + const text = mergeText(items); + + expect(text).toEqual("𠮷"); + + await loadingTask.destroy(); + }); + it("gets empty structure tree", async function () { const tree = await page.getStructTree();