Add more non-standard ligatures in the glyphlist.js
file (issue 15516)
Note that this PR only adds the "underscore"-variants of *actually existing* ligatures; however, the referenced PDF document also uses a couple of non-standard ones (e.g. `ft`, `Th`, and `fh`) that we cannot easily support without larger changes (since they don't have official Unicode entries). Given that it's clearly the PDF document, and its fonts, that are the culprit here, it's not entirely clear to me that we actually want to attempt a larger refactoring/rewriting of the `glyphlist.js` code, assuming that's even generally possible. This is especially true since this patch alone already improves our copy-paste behaviour compared to both Adobe Reader and PDFium, and since this is only the *second* time this sort of bug has been reported.
This commit is contained in:
parent
5675a6ee64
commit
c87f90102c
@ -1828,10 +1828,13 @@ const getGlyphsUnicode = getArrayLookupTableFactory(function () {
|
||||
"feicoptic", 0x03e5,
|
||||
"female", 0x2640,
|
||||
"ff", 0xfb00,
|
||||
"f_f", 0xfb00, // Fixes issue 11016.
|
||||
"f_f", 0xfb00,
|
||||
"ffi", 0xfb03,
|
||||
"f_f_i", 0xfb03,
|
||||
"ffl", 0xfb04,
|
||||
"f_f_l", 0xfb04,
|
||||
"fi", 0xfb01,
|
||||
"f_i", 0xfb01,
|
||||
"fifteencircle", 0x246e,
|
||||
"fifteenparen", 0x2482,
|
||||
"fifteenperiod", 0x2496,
|
||||
@ -1875,6 +1878,7 @@ const getGlyphsUnicode = getArrayLookupTableFactory(function () {
|
||||
"fivesuperior", 0x2075,
|
||||
"fivethai", 0x0e55,
|
||||
"fl", 0xfb02,
|
||||
"f_l", 0xfb02,
|
||||
"florin", 0x0192,
|
||||
"fmonospace", 0xff46,
|
||||
"fmsquare", 0x3399,
|
||||
|
1
test/pdfs/.gitignore
vendored
1
test/pdfs/.gitignore
vendored
@ -108,6 +108,7 @@
|
||||
!issue10542_reduced.pdf
|
||||
!issue10665_reduced.pdf
|
||||
!issue11016_reduced.pdf
|
||||
!issue15516_reduced.pdf
|
||||
!issue11045.pdf
|
||||
!bug1057544.pdf
|
||||
!issue11150_reduced.pdf
|
||||
|
BIN
test/pdfs/issue15516_reduced.pdf
Normal file
BIN
test/pdfs/issue15516_reduced.pdf
Normal file
Binary file not shown.
@ -2417,6 +2417,23 @@ Caron Broadcasting, Inc., an Ohio corporation (“Lessee”).`)
|
||||
await loadingTask.destroy();
|
||||
});
|
||||
|
||||
// TODO: Change this to a `text` reference test instead.
|
||||
// Currently that doesn't work, since the `XMLSerializer` fails on
|
||||
// the ASCII "control characters" found in the text-content.
|
||||
it("gets text content with non-standard ligatures (issue issue15516)", async function () {
|
||||
const loadingTask = getDocument(
|
||||
buildGetDocumentParams("issue15516_reduced.pdf")
|
||||
);
|
||||
const pdfDoc = await loadingTask.promise;
|
||||
const pdfPage = await pdfDoc.getPage(1);
|
||||
const { items } = await pdfPage.getTextContent();
|
||||
const text = mergeText(items);
|
||||
|
||||
expect(text).toEqual("ffi fi ffl ff fl \x07 \x08 Ý");
|
||||
|
||||
await loadingTask.destroy();
|
||||
});
|
||||
|
||||
it("gets empty structure tree", async function () {
|
||||
const tree = await page.getStructTree();
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user