Support multi-byte ToUnicode entries when using predefined CMaps (issue 16176)
This should hopefully be a reasonable change, since we already "create" multi-byte ToUnicode entries in other cases (see e.g. the `getNormalizedUnicodes` table).
This commit is contained in:
parent
b1e0253f29
commit
d4bcfe8c16
@ -3632,7 +3632,8 @@ class PartialEvaluator {
|
|||||||
fetchBuiltInCMap: this._fetchBuiltInCMapBound,
|
fetchBuiltInCMap: this._fetchBuiltInCMapBound,
|
||||||
useCMap: null,
|
useCMap: null,
|
||||||
});
|
});
|
||||||
const toUnicode = [];
|
const toUnicode = [],
|
||||||
|
buf = [];
|
||||||
properties.cMap.forEach(function (charcode, cid) {
|
properties.cMap.forEach(function (charcode, cid) {
|
||||||
if (cid > 0xffff) {
|
if (cid > 0xffff) {
|
||||||
throw new FormatError("Max size of CID is 65,535");
|
throw new FormatError("Max size of CID is 65,535");
|
||||||
@ -3641,9 +3642,12 @@ class PartialEvaluator {
|
|||||||
// obtained in step (d), producing a Unicode value.
|
// obtained in step (d), producing a Unicode value.
|
||||||
const ucs2 = ucs2CMap.lookup(cid);
|
const ucs2 = ucs2CMap.lookup(cid);
|
||||||
if (ucs2) {
|
if (ucs2) {
|
||||||
toUnicode[charcode] = String.fromCharCode(
|
buf.length = 0;
|
||||||
(ucs2.charCodeAt(0) << 8) + ucs2.charCodeAt(1)
|
// Support multi-byte entries (fixes issue16176.pdf).
|
||||||
);
|
for (let i = 0, ii = ucs2.length; i < ii; i += 2) {
|
||||||
|
buf.push((ucs2.charCodeAt(i) << 8) + ucs2.charCodeAt(i + 1));
|
||||||
|
}
|
||||||
|
toUnicode[charcode] = String.fromCharCode(...buf);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
return new ToUnicodeMap(toUnicode);
|
return new ToUnicodeMap(toUnicode);
|
||||||
|
1
test/pdfs/.gitignore
vendored
1
test/pdfs/.gitignore
vendored
@ -118,6 +118,7 @@
|
|||||||
!issue6127.pdf
|
!issue6127.pdf
|
||||||
!issue7891_bc0.pdf
|
!issue7891_bc0.pdf
|
||||||
!issue11242_reduced.pdf
|
!issue11242_reduced.pdf
|
||||||
|
!issue16176.pdf
|
||||||
!issue11279.pdf
|
!issue11279.pdf
|
||||||
!issue11362.pdf
|
!issue11362.pdf
|
||||||
!issue13325_reduced.pdf
|
!issue13325_reduced.pdf
|
||||||
|
BIN
test/pdfs/issue16176.pdf
Normal file
BIN
test/pdfs/issue16176.pdf
Normal file
Binary file not shown.
@ -28,6 +28,7 @@ import {
|
|||||||
} from "../../src/shared/util.js";
|
} from "../../src/shared/util.js";
|
||||||
import {
|
import {
|
||||||
buildGetDocumentParams,
|
buildGetDocumentParams,
|
||||||
|
CMAP_URL,
|
||||||
DefaultFileReaderFactory,
|
DefaultFileReaderFactory,
|
||||||
TEST_PDFS_PATH,
|
TEST_PDFS_PATH,
|
||||||
} from "./test_utils.js";
|
} from "./test_utils.js";
|
||||||
@ -2593,6 +2594,23 @@ Caron Broadcasting, Inc., an Ohio corporation (“Lessee”).`)
|
|||||||
await loadingTask.destroy();
|
await loadingTask.destroy();
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it("gets text content with multi-byte entries, using predefined CMaps (issue 16176)", async function () {
|
||||||
|
const loadingTask = getDocument(
|
||||||
|
buildGetDocumentParams("issue16176.pdf", {
|
||||||
|
cMapUrl: CMAP_URL,
|
||||||
|
useWorkerFetch: false,
|
||||||
|
})
|
||||||
|
);
|
||||||
|
const pdfDoc = await loadingTask.promise;
|
||||||
|
const pdfPage = await pdfDoc.getPage(1);
|
||||||
|
const { items } = await pdfPage.getTextContent();
|
||||||
|
const text = mergeText(items);
|
||||||
|
|
||||||
|
expect(text).toEqual("𠮷");
|
||||||
|
|
||||||
|
await loadingTask.destroy();
|
||||||
|
});
|
||||||
|
|
||||||
it("gets empty structure tree", async function () {
|
it("gets empty structure tree", async function () {
|
||||||
const tree = await page.getStructTree();
|
const tree = await page.getStructTree();
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user