Remove the invisible format marks from the text chunks

- It aims to fix issue #9186.
This commit is contained in:
Calixte Denizet 2022-01-23 23:04:18 +01:00
parent 88236e1163
commit e1d3a3b414
7 changed files with 99 additions and 14 deletions

View File

@ -2561,6 +2561,9 @@ class PartialEvaluator {
for (let i = 0, ii = glyphs.length; i < ii; i++) {
const glyph = glyphs[i];
if (glyph.isInvisibleFormatMark) {
continue;
}
let charSpacing =
textState.charSpacing + (i + 1 === ii ? extraSpacing : 0);
@ -2601,7 +2604,7 @@ class PartialEvaluator {
// Must be called after compareWithLastPosition because
// the textContentItem could have been flushed.
const textChunk = ensureTextContentItem();
if (glyph.isDiacritic) {
if (glyph.isZeroWidthDiacritic) {
scaledDim = 0;
}

View File

@ -216,7 +216,8 @@ class Glyph {
const category = getCharUnicodeCategory(unicode);
this.isWhitespace = category.isWhitespace;
this.isDiacritic = category.isDiacritic;
this.isZeroWidthDiacritic = category.isZeroWidthDiacritic;
this.isInvisibleFormatMark = category.isInvisibleFormatMark;
}
matchesForCache(

View File

@ -1640,12 +1640,13 @@ function reverseIfRtl(chars) {
return buf.join("");
}
const SpecialCharRegExp = new RegExp("^(\\s)|(\\p{Mn})$", "u");
const SpecialCharRegExp = new RegExp("^(\\s)|(\\p{Mn})|(\\p{Cf})$", "u");
/**
 * Classify a string by matching it against `SpecialCharRegExp` and exposing
 * which capture group matched as boolean flags.
 *
 * @param {string} char - The string to classify.
 * @returns {Object} Flags derived from the capture groups:
 *   group 1 -> `isWhitespace`,
 *   group 2 -> `isDiacritic` and `isZeroWidthDiacritic`,
 *   group 3 -> `isInvisibleFormatMark`.
 *   All flags are `false` when the regular expression does not match.
 *
 * NOTE(review): in `"^(\\s)|(\\p{Mn})|(\\p{Cf})$"` the `^` binds only to the
 * first alternative and the `$` only to the last one, so the pattern is only
 * equivalent to a fully anchored one for single-character input; confirm
 * whether multi-character strings can reach this function.
 */
function getCharUnicodeCategory(char) {
// `match` yields `null` when nothing matches, hence the `groups &&` guards.
const groups = char.match(SpecialCharRegExp);
return {
isWhitespace: !!(groups && groups[1]),
// NOTE(review): both keys below read group 2; this view is a diff rendering,
// so `isDiacritic` is presumably the line being replaced by
// `isZeroWidthDiacritic` -- confirm against the real file.
isDiacritic: !!(groups && groups[2]),
isZeroWidthDiacritic: !!(groups && groups[2]),
isInvisibleFormatMark: !!(groups && groups[3]),
};
}

View File

@ -0,0 +1 @@
https://github.com/mozilla/pdf.js/files/1500985/Sample.lease.contract.26.pdf

View File

@ -6230,5 +6230,13 @@
"firstPage": 1,
"lastPage": 1,
"type": "eq"
},
{ "id": "issue9186",
"file": "pdfs/issue9186.pdf",
"md5": "d151857bb724ab9a291704a45a0b5d7f",
"rounds": 1,
"link": true,
"lastPage": 1,
"type": "text"
}
]

View File

@ -2147,6 +2147,29 @@ sozialökonomische Gerechtigkeit.`)
await loadingTask.destroy();
});
// Regression test for issue 9186: the merged text content of page 1 of
// issue9186.pdf must contain the paragraph below verbatim.
it("gets text content, with invisible text marks (issue 9186)", async function () {
// issue9186.pdf is a linked test-case (its manifest entry has `"link": true`),
// so the spec is marked pending under Node.js, per the message below.
if (isNodeJS) {
pending("Linked test-cases are not supported in Node.js.");
}
const loadingTask = getDocument(buildGetDocumentParams("issue9186.pdf"));
const pdfDoc = await loadingTask.promise;
const pdfPage = await pdfDoc.getPage(1);
const { items } = await pdfPage.getTextContent();
const text = mergeText(items);
// The expected paragraph is matched as a substring, line breaks included,
// so the template literal's content must stay exactly as written.
expect(
text.includes(`This Agreement (“Agreement”) is made as of this 25th day of January, 2017, by and
between EDWARD G. ATSINGER III, not individually but as sole Trustee of the ATSINGER
FAMILY TRUST /u/a dated October 31, 1980 as amended, and STUART W. EPPERSON, not
individually but solely as Trustee of the STUART W. EPPERSON REVOCABLE LIVING
TRUST /u/a dated January 14th 1993 as amended, collectively referred to herein as Lessor, and
Caron Broadcasting, Inc., an Ohio corporation (Lessee).`)
).toEqual(true);
await loadingTask.destroy();
});
it("gets text content, with beginbfrange operator handled correctly (bug 1627427)", async function () {
const loadingTask = getDocument(
buildGetDocumentParams("bug1627427_reduced.pdf")

View File

@ -47,19 +47,67 @@ describe("unicode", function () {
it("should correctly determine the character category", function () {
const tests = {
// Whitespace
" ": { isDiacritic: false, isWhitespace: true },
"\t": { isDiacritic: false, isWhitespace: true },
"\u2001": { isDiacritic: false, isWhitespace: true },
"\uFEFF": { isDiacritic: false, isWhitespace: true },
" ": {
isZeroWidthDiacritic: false,
isInvisibleFormatMark: false,
isWhitespace: true,
},
"\t": {
isZeroWidthDiacritic: false,
isInvisibleFormatMark: false,
isWhitespace: true,
},
"\u2001": {
isZeroWidthDiacritic: false,
isInvisibleFormatMark: false,
isWhitespace: true,
},
"\uFEFF": {
isZeroWidthDiacritic: false,
isInvisibleFormatMark: false,
isWhitespace: true,
},
// Diacritic
"\u0302": { isDiacritic: true, isWhitespace: false },
"\u0344": { isDiacritic: true, isWhitespace: false },
"\u0361": { isDiacritic: true, isWhitespace: false },
"\u0302": {
isZeroWidthDiacritic: true,
isInvisibleFormatMark: false,
isWhitespace: false,
},
"\u0344": {
isZeroWidthDiacritic: true,
isInvisibleFormatMark: false,
isWhitespace: false,
},
"\u0361": {
isZeroWidthDiacritic: true,
isInvisibleFormatMark: false,
isWhitespace: false,
},
// No whitespace or diacritic
a: { isDiacritic: false, isWhitespace: false },
1: { isDiacritic: false, isWhitespace: false },
// Invisible format mark
"\u200B": {
isZeroWidthDiacritic: false,
isInvisibleFormatMark: true,
isWhitespace: false,
},
"\u200D": {
isZeroWidthDiacritic: false,
isInvisibleFormatMark: true,
isWhitespace: false,
},
// No whitespace or diacritic or invisible format mark
a: {
isZeroWidthDiacritic: false,
isInvisibleFormatMark: false,
isWhitespace: false,
},
1: {
isZeroWidthDiacritic: false,
isInvisibleFormatMark: false,
isWhitespace: false,
},
};
for (const [character, expectation] of Object.entries(tests)) {
expect(getCharUnicodeCategory(character)).toEqual(expectation);