Merge pull request #14483 from calixteman/200B

Remove the invisible format marks from the text chunks
This commit is contained in:
calixteman 2022-01-24 17:52:06 +01:00 committed by GitHub
commit 9367d54009
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 99 additions and 14 deletions

View File

@ -2561,6 +2561,9 @@ class PartialEvaluator {
for (let i = 0, ii = glyphs.length; i < ii; i++) {
const glyph = glyphs[i];
if (glyph.isInvisibleFormatMark) {
continue;
}
let charSpacing =
textState.charSpacing + (i + 1 === ii ? extraSpacing : 0);
@ -2601,7 +2604,7 @@ class PartialEvaluator {
// Must be called after compareWithLastPosition because
// the textContentItem could have been flushed.
const textChunk = ensureTextContentItem();
if (glyph.isDiacritic) {
if (glyph.isZeroWidthDiacritic) {
scaledDim = 0;
}

View File

@ -216,7 +216,8 @@ class Glyph {
const category = getCharUnicodeCategory(unicode);
this.isWhitespace = category.isWhitespace;
this.isDiacritic = category.isDiacritic;
this.isZeroWidthDiacritic = category.isZeroWidthDiacritic;
this.isInvisibleFormatMark = category.isInvisibleFormatMark;
}
matchesForCache(

View File

@ -1640,12 +1640,13 @@ function reverseIfRtl(chars) {
return buf.join("");
}
const SpecialCharRegExp = new RegExp("^(\\s)|(\\p{Mn})$", "u");
const SpecialCharRegExp = new RegExp("^(\\s)|(\\p{Mn})|(\\p{Cf})$", "u");
/**
 * Classify a string by matching it against `SpecialCharRegExp` and
 * reporting which of that pattern's capture groups participated.
 *
 * @param {string} char - The string to test (the name suggests a single
 *   character; this function does not enforce that).
 * @returns {Object} An object of boolean flags, one per category; every flag
 *   is `false` when the pattern does not match at all.
 */
function getCharUnicodeCategory(char) {
// `match` returns null when nothing matches, hence the `groups &&` guards.
// NOTE(review): in the three-alternative pattern, `^` binds only to the
// `\s` alternative and `$` only to the `\p{Cf}` alternative, so for a
// multi-character string the `\p{Mn}` alternative can match anywhere in the
// input -- confirm callers only pass single characters.
const groups = char.match(SpecialCharRegExp);
return {
// Group 1: the `\s` (whitespace) alternative.
isWhitespace: !!(groups && groups[1]),
// Group 2: the `\p{Mn}` (non-spacing mark) alternative.
// NOTE(review): this listing looks like a diff shown without +/- markers;
// `isDiacritic` appears to be the pre-change name that
// `isZeroWidthDiacritic` replaces -- confirm before relying on both keys.
isDiacritic: !!(groups && groups[2]),
isZeroWidthDiacritic: !!(groups && groups[2]),
// Group 3: the `\p{Cf}` (format character) alternative.
isInvisibleFormatMark: !!(groups && groups[3]),
};
}

View File

@ -0,0 +1 @@
https://github.com/mozilla/pdf.js/files/1500985/Sample.lease.contract.26.pdf

View File

@ -6230,5 +6230,13 @@
"firstPage": 1,
"lastPage": 1,
"type": "eq"
},
{ "id": "issue9186",
"file": "pdfs/issue9186.pdf",
"md5": "d151857bb724ab9a291704a45a0b5d7f",
"rounds": 1,
"link": true,
"lastPage": 1,
"type": "text"
}
]

View File

@ -2147,6 +2147,29 @@ sozialökonomische Gerechtigkeit.`)
await loadingTask.destroy();
});
// Regression spec for issue 9186: the text extracted from page 1 of
// issue9186.pdf must contain the paragraph below as a plain substring.
// Presumably the match would fail if invisible format marks were still
// present in the extracted text -- verify against the PDF.
it("gets text content, with invisible text marks (issue 9186)", async function () {
// The PDF is registered as a linked test file, so the spec is marked
// pending when running under Node.js.
if (isNodeJS) {
pending("Linked test-cases are not supported in Node.js.");
}
const loadingTask = getDocument(buildGetDocumentParams("issue9186.pdf"));
const pdfDoc = await loadingTask.promise;
const pdfPage = await pdfDoc.getPage(1);
const { items } = await pdfPage.getTextContent();
// `mergeText` is a helper defined elsewhere in this spec file; it turns the
// text items into the single string searched below.
const text = mergeText(items);
// The expected paragraph is a multi-line template literal: its line breaks
// are part of the string being searched for.
expect(
text.includes(`This Agreement (“Agreement”) is made as of this 25th day of January, 2017, by and
between EDWARD G. ATSINGER III, not individually but as sole Trustee of the ATSINGER
FAMILY TRUST /u/a dated October 31, 1980 as amended, and STUART W. EPPERSON, not
individually but solely as Trustee of the STUART W. EPPERSON REVOCABLE LIVING
TRUST /u/a dated January 14th 1993 as amended, collectively referred to herein as Lessor, and
Caron Broadcasting, Inc., an Ohio corporation (Lessee).`)
).toEqual(true);
// Release the document and its worker resources once the check is done.
await loadingTask.destroy();
});
it("gets text content, with beginbfrange operator handled correctly (bug 1627427)", async function () {
const loadingTask = getDocument(
buildGetDocumentParams("bug1627427_reduced.pdf")

View File

@ -47,19 +47,67 @@ describe("unicode", function () {
it("should correctly determine the character category", function () {
const tests = {
// Whitespace
" ": { isDiacritic: false, isWhitespace: true },
"\t": { isDiacritic: false, isWhitespace: true },
"\u2001": { isDiacritic: false, isWhitespace: true },
"\uFEFF": { isDiacritic: false, isWhitespace: true },
" ": {
isZeroWidthDiacritic: false,
isInvisibleFormatMark: false,
isWhitespace: true,
},
"\t": {
isZeroWidthDiacritic: false,
isInvisibleFormatMark: false,
isWhitespace: true,
},
"\u2001": {
isZeroWidthDiacritic: false,
isInvisibleFormatMark: false,
isWhitespace: true,
},
"\uFEFF": {
isZeroWidthDiacritic: false,
isInvisibleFormatMark: false,
isWhitespace: true,
},
// Diacritic
"\u0302": { isDiacritic: true, isWhitespace: false },
"\u0344": { isDiacritic: true, isWhitespace: false },
"\u0361": { isDiacritic: true, isWhitespace: false },
"\u0302": {
isZeroWidthDiacritic: true,
isInvisibleFormatMark: false,
isWhitespace: false,
},
"\u0344": {
isZeroWidthDiacritic: true,
isInvisibleFormatMark: false,
isWhitespace: false,
},
"\u0361": {
isZeroWidthDiacritic: true,
isInvisibleFormatMark: false,
isWhitespace: false,
},
// No whitespace or diacritic
a: { isDiacritic: false, isWhitespace: false },
1: { isDiacritic: false, isWhitespace: false },
// Invisible format mark
"\u200B": {
isZeroWidthDiacritic: false,
isInvisibleFormatMark: true,
isWhitespace: false,
},
"\u200D": {
isZeroWidthDiacritic: false,
isInvisibleFormatMark: true,
isWhitespace: false,
},
// No whitespace or diacritic or invisible format mark
a: {
isZeroWidthDiacritic: false,
isInvisibleFormatMark: false,
isWhitespace: false,
},
1: {
isZeroWidthDiacritic: false,
isInvisibleFormatMark: false,
isWhitespace: false,
},
};
for (const [character, expectation] of Object.entries(tests)) {
expect(getCharUnicodeCategory(character)).toEqual(expectation);