Merge pull request #16186 from Snuffleupagus/issue-16176

Support multi-byte ToUnicode entries, when using predefined CMaps (issue 16176)
This commit is contained in:
Jonas Jenwald 2023-03-21 22:17:18 +01:00 committed by GitHub
commit 9321758d91
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 27 additions and 4 deletions

View File

@ -3632,7 +3632,8 @@ class PartialEvaluator {
fetchBuiltInCMap: this._fetchBuiltInCMapBound, fetchBuiltInCMap: this._fetchBuiltInCMapBound,
useCMap: null, useCMap: null,
}); });
const toUnicode = []; const toUnicode = [],
buf = [];
properties.cMap.forEach(function (charcode, cid) { properties.cMap.forEach(function (charcode, cid) {
if (cid > 0xffff) { if (cid > 0xffff) {
throw new FormatError("Max size of CID is 65,535"); throw new FormatError("Max size of CID is 65,535");
@ -3641,9 +3642,12 @@ class PartialEvaluator {
// obtained in step (d), producing a Unicode value. // obtained in step (d), producing a Unicode value.
const ucs2 = ucs2CMap.lookup(cid); const ucs2 = ucs2CMap.lookup(cid);
if (ucs2) { if (ucs2) {
toUnicode[charcode] = String.fromCharCode( buf.length = 0;
(ucs2.charCodeAt(0) << 8) + ucs2.charCodeAt(1) // Support multi-byte entries (fixes issue16176.pdf).
); for (let i = 0, ii = ucs2.length; i < ii; i += 2) {
buf.push((ucs2.charCodeAt(i) << 8) + ucs2.charCodeAt(i + 1));
}
toUnicode[charcode] = String.fromCharCode(...buf);
} }
}); });
return new ToUnicodeMap(toUnicode); return new ToUnicodeMap(toUnicode);

View File

@ -118,6 +118,7 @@
!issue6127.pdf !issue6127.pdf
!issue7891_bc0.pdf !issue7891_bc0.pdf
!issue11242_reduced.pdf !issue11242_reduced.pdf
!issue16176.pdf
!issue11279.pdf !issue11279.pdf
!issue11362.pdf !issue11362.pdf
!issue13325_reduced.pdf !issue13325_reduced.pdf

BIN
test/pdfs/issue16176.pdf Normal file

Binary file not shown.

View File

@ -29,6 +29,7 @@ import {
} from "../../src/shared/util.js"; } from "../../src/shared/util.js";
import { import {
buildGetDocumentParams, buildGetDocumentParams,
CMAP_URL,
DefaultFileReaderFactory, DefaultFileReaderFactory,
TEST_PDFS_PATH, TEST_PDFS_PATH,
} from "./test_utils.js"; } from "./test_utils.js";
@ -2623,6 +2624,23 @@ Caron Broadcasting, Inc., an Ohio corporation (“Lessee”).`)
await loadingTask.destroy(); await loadingTask.destroy();
}); });
// Regression test for issue 16176: loads issue16176.pdf with the CMap URL
// supplied and worker-side fetching disabled, then checks the text
// extracted from the first page.
it("gets text content with multi-byte entries, using predefined CMaps (issue 16176)", async function () {
  const params = buildGetDocumentParams("issue16176.pdf", {
    cMapUrl: CMAP_URL,
    useWorkerFetch: false,
  });
  const loadingTask = getDocument(params);

  const doc = await loadingTask.promise;
  const firstPage = await doc.getPage(1);
  const textContent = await firstPage.getTextContent();

  // The expected character lies outside the Basic Multilingual Plane, so it
  // occupies two UTF-16 code units in the resulting string.
  expect(mergeText(textContent.items)).toEqual("𠮷");

  await loadingTask.destroy();
});
it("gets empty structure tree", async function () { it("gets empty structure tree", async function () {
const tree = await page.getStructTree(); const tree = await page.getStructTree();