Add more non-standard ligatures in the glyphlist.js file (issue 15516)

Note that this PR only adds the "underscore"-variant of *actually existing* ligatures; however, the referenced PDF document also uses a couple of non-standard ones (e.g. `ft`, `Th`, and `fh`) that we cannot easily support without larger changes (since they don't have official Unicode entries).
Given that it's clearly the PDF document, and its fonts, that are the culprit here, it's not entirely clear to me that we actually want to attempt a larger refactoring/rewriting of the `glyphlist.js` code, assuming it's even generally possible; especially since this patch alone already improves our copy-paste behaviour compared to both Adobe Reader and PDFium, and since this is only the *second* time this sort of bug has been reported.
This commit is contained in:
Jonas Jenwald 2022-09-27 15:19:57 +02:00
parent 5675a6ee64
commit c87f90102c
4 changed files with 23 additions and 1 deletions

View File

@ -1828,10 +1828,13 @@ const getGlyphsUnicode = getArrayLookupTableFactory(function () {
"feicoptic", 0x03e5,
"female", 0x2640,
"ff", 0xfb00,
"f_f", 0xfb00, // Fixes issue 11016.
"f_f", 0xfb00,
"ffi", 0xfb03,
"f_f_i", 0xfb03,
"ffl", 0xfb04,
"f_f_l", 0xfb04,
"fi", 0xfb01,
"f_i", 0xfb01,
"fifteencircle", 0x246e,
"fifteenparen", 0x2482,
"fifteenperiod", 0x2496,
@ -1875,6 +1878,7 @@ const getGlyphsUnicode = getArrayLookupTableFactory(function () {
"fivesuperior", 0x2075,
"fivethai", 0x0e55,
"fl", 0xfb02,
"f_l", 0xfb02,
"florin", 0x0192,
"fmonospace", 0xff46,
"fmsquare", 0x3399,

View File

@ -108,6 +108,7 @@
!issue10542_reduced.pdf
!issue10665_reduced.pdf
!issue11016_reduced.pdf
!issue15516_reduced.pdf
!issue11045.pdf
!bug1057544.pdf
!issue11150_reduced.pdf

Binary file not shown.

View File

@ -2417,6 +2417,23 @@ Caron Broadcasting, Inc., an Ohio corporation (“Lessee”).`)
await loadingTask.destroy();
});
// Checks the text-content extracted from `issue15516_reduced.pdf`, whose
// fonts use non-standard ligature glyph names.
//
// TODO: Change this to a `text` reference test instead.
//       Currently that doesn't work, since the `XMLSerializer` fails on
//       the ASCII "control characters" found in the text-content.
it("gets text content with non-standard ligatures (issue 15516)", async function () {
  const loadingTask = getDocument(
    buildGetDocumentParams("issue15516_reduced.pdf")
  );
  try {
    const pdfDoc = await loadingTask.promise;
    const pdfPage = await pdfDoc.getPage(1);
    const { items } = await pdfPage.getTextContent();
    const text = mergeText(items);

    // NOTE(review): the trailing "\x07 \x08 Ý" is presumably what the
    // ligatures without official Unicode entries currently extract as --
    // confirm against the PDF before tightening this expectation.
    expect(text).toEqual("ffi fi ffl ff fl \x07 \x08 Ý");
  } finally {
    // Always tear down the loading task, even when loading or text
    // extraction rejects, so a failure here cannot leak into later specs.
    await loadingTask.destroy();
  }
});
it("gets empty structure tree", async function () {
const tree = await page.getStructTree();