diff --git a/src/core/evaluator.js b/src/core/evaluator.js index 13b82ec30..4838a29ea 100644 --- a/src/core/evaluator.js +++ b/src/core/evaluator.js @@ -2561,6 +2561,9 @@ class PartialEvaluator { for (let i = 0, ii = glyphs.length; i < ii; i++) { const glyph = glyphs[i]; + if (glyph.isInvisibleFormatMark) { + continue; + } let charSpacing = textState.charSpacing + (i + 1 === ii ? extraSpacing : 0); @@ -2601,7 +2604,7 @@ class PartialEvaluator { // Must be called after compareWithLastPosition because // the textContentItem could have been flushed. const textChunk = ensureTextContentItem(); - if (glyph.isDiacritic) { + if (glyph.isZeroWidthDiacritic) { scaledDim = 0; } diff --git a/src/core/fonts.js b/src/core/fonts.js index 3e20bbc18..a1c694455 100644 --- a/src/core/fonts.js +++ b/src/core/fonts.js @@ -216,7 +216,8 @@ class Glyph { const category = getCharUnicodeCategory(unicode); this.isWhitespace = category.isWhitespace; - this.isDiacritic = category.isDiacritic; + this.isZeroWidthDiacritic = category.isZeroWidthDiacritic; + this.isInvisibleFormatMark = category.isInvisibleFormatMark; } matchesForCache( diff --git a/src/core/unicode.js b/src/core/unicode.js index 71c1c8d4f..647272834 100644 --- a/src/core/unicode.js +++ b/src/core/unicode.js @@ -1640,12 +1640,13 @@ function reverseIfRtl(chars) { return buf.join(""); } -const SpecialCharRegExp = new RegExp("^(\\s)|(\\p{Mn})$", "u"); +const SpecialCharRegExp = new RegExp("^(\\s)|(\\p{Mn})|(\\p{Cf})$", "u"); function getCharUnicodeCategory(char) { const groups = char.match(SpecialCharRegExp); return { isWhitespace: !!(groups && groups[1]), - isDiacritic: !!(groups && groups[2]), + isZeroWidthDiacritic: !!(groups && groups[2]), + isInvisibleFormatMark: !!(groups && groups[3]), }; } diff --git a/test/pdfs/issue9186.pdf.link b/test/pdfs/issue9186.pdf.link new file mode 100644 index 000000000..e8747599f --- /dev/null +++ b/test/pdfs/issue9186.pdf.link @@ -0,0 +1 @@ +https://github.com/mozilla/pdf.js/files/1500985/Sample.lease.contract.26.pdf diff --git a/test/test_manifest.json b/test/test_manifest.json index 777445554..6d397afc4 100644 --- a/test/test_manifest.json +++ b/test/test_manifest.json @@ -6230,5 +6230,13 @@ "firstPage": 1, "lastPage": 1, "type": "eq" + }, + { "id": "issue9186", + "file": "pdfs/issue9186.pdf", + "md5": "d151857bb724ab9a291704a45a0b5d7f", + "rounds": 1, + "link": true, + "lastPage": 1, + "type": "text" } ] diff --git a/test/unit/api_spec.js b/test/unit/api_spec.js index cb1f61216..86c63675f 100644 --- a/test/unit/api_spec.js +++ b/test/unit/api_spec.js @@ -2147,6 +2147,29 @@ sozialökonomische Gerechtigkeit.`) await loadingTask.destroy(); }); + it("gets text content, with invisible text marks (issue 9186)", async function () { + if (isNodeJS) { + pending("Linked test-cases are not supported in Node.js."); + } + + const loadingTask = getDocument(buildGetDocumentParams("issue9186.pdf")); + const pdfDoc = await loadingTask.promise; + const pdfPage = await pdfDoc.getPage(1); + const { items } = await pdfPage.getTextContent(); + const text = mergeText(items); + + expect( + text.includes(`This Agreement (“Agreement”) is made as of this 25th day of January, 2017, by and +between EDWARD G. ATSINGER III, not individually but as sole Trustee of the ATSINGER +FAMILY TRUST /u/a dated October 31, 1980 as amended, and STUART W. EPPERSON, not +individually but solely as Trustee of the STUART W. EPPERSON REVOCABLE LIVING +TRUST /u/a dated January 14th 1993 as amended, collectively referred to herein as “Lessor”, and +Caron Broadcasting, Inc., an Ohio corporation (“Lessee”).`) + ).toEqual(true); + + await loadingTask.destroy(); + }); + it("gets text content, with beginbfrange operator handled correctly (bug 1627427)", async function () { const loadingTask = getDocument( buildGetDocumentParams("bug1627427_reduced.pdf") diff --git a/test/unit/unicode_spec.js b/test/unit/unicode_spec.js index f9fd2a0cb..1753ef26b 100644 --- a/test/unit/unicode_spec.js +++ b/test/unit/unicode_spec.js @@ -47,19 +47,67 @@ describe("unicode", function () { it("should correctly determine the character category", function () { const tests = { // Whitespace - " ": { isDiacritic: false, isWhitespace: true }, - "\t": { isDiacritic: false, isWhitespace: true }, - "\u2001": { isDiacritic: false, isWhitespace: true }, - "\uFEFF": { isDiacritic: false, isWhitespace: true }, + " ": { + isZeroWidthDiacritic: false, + isInvisibleFormatMark: false, + isWhitespace: true, + }, + "\t": { + isZeroWidthDiacritic: false, + isInvisibleFormatMark: false, + isWhitespace: true, + }, + "\u2001": { + isZeroWidthDiacritic: false, + isInvisibleFormatMark: false, + isWhitespace: true, + }, + "\uFEFF": { + isZeroWidthDiacritic: false, + isInvisibleFormatMark: false, + isWhitespace: true, + }, // Diacritic - "\u0302": { isDiacritic: true, isWhitespace: false }, - "\u0344": { isDiacritic: true, isWhitespace: false }, - "\u0361": { isDiacritic: true, isWhitespace: false }, + "\u0302": { + isZeroWidthDiacritic: true, + isInvisibleFormatMark: false, + isWhitespace: false, + }, + "\u0344": { + isZeroWidthDiacritic: true, + isInvisibleFormatMark: false, + isWhitespace: false, + }, + "\u0361": { + isZeroWidthDiacritic: true, + isInvisibleFormatMark: false, + isWhitespace: false, + }, - // No whitespace or diacritic - a: { isDiacritic: false, isWhitespace: false }, - 1: { isDiacritic: false, isWhitespace: false }, + // Invisible format mark + "\u200B": { + isZeroWidthDiacritic: false, + isInvisibleFormatMark: true, + isWhitespace: false, + }, + "\u200D": { + isZeroWidthDiacritic: false, + isInvisibleFormatMark: true, + isWhitespace: false, + }, + + // No whitespace or diacritic or invisible format mark + a: { + isZeroWidthDiacritic: false, + isInvisibleFormatMark: false, + isWhitespace: false, + }, + 1: { + isZeroWidthDiacritic: false, + isInvisibleFormatMark: false, + isWhitespace: false, + }, }; for (const [character, expectation] of Object.entries(tests)) { expect(getCharUnicodeCategory(character)).toEqual(expectation);