Remove the invisible format marks (Unicode general category Cf) from the text chunks
- This change aims to fix issue #9186.
This commit is contained in:
parent
88236e1163
commit
e1d3a3b414
@ -2561,6 +2561,9 @@ class PartialEvaluator {
|
||||
|
||||
for (let i = 0, ii = glyphs.length; i < ii; i++) {
|
||||
const glyph = glyphs[i];
|
||||
if (glyph.isInvisibleFormatMark) {
|
||||
continue;
|
||||
}
|
||||
let charSpacing =
|
||||
textState.charSpacing + (i + 1 === ii ? extraSpacing : 0);
|
||||
|
||||
@ -2601,7 +2604,7 @@ class PartialEvaluator {
|
||||
// Must be called after compareWithLastPosition because
|
||||
// the textContentItem could have been flushed.
|
||||
const textChunk = ensureTextContentItem();
|
||||
if (glyph.isDiacritic) {
|
||||
if (glyph.isZeroWidthDiacritic) {
|
||||
scaledDim = 0;
|
||||
}
|
||||
|
||||
|
@ -216,7 +216,8 @@ class Glyph {
|
||||
|
||||
const category = getCharUnicodeCategory(unicode);
|
||||
this.isWhitespace = category.isWhitespace;
|
||||
this.isDiacritic = category.isDiacritic;
|
||||
this.isZeroWidthDiacritic = category.isZeroWidthDiacritic;
|
||||
this.isInvisibleFormatMark = category.isInvisibleFormatMark;
|
||||
}
|
||||
|
||||
matchesForCache(
|
||||
|
@ -1640,12 +1640,13 @@ function reverseIfRtl(chars) {
|
||||
return buf.join("");
|
||||
}
|
||||
|
||||
/**
 * Matches a string made of exactly one "special" code point and records
 * which kind it is through its capture groups:
 *   1. whitespace (`\s`),
 *   2. non-spacing mark (`\p{Mn}`),
 *   3. format character (`\p{Cf}`).
 *
 * The alternatives are wrapped in a non-capturing group so that `^` and `$`
 * apply to every one of them. Without the group, `^` only constrains the
 * whitespace alternative and `$` only the format-character alternative, so
 * a longer string such as "e\u0301" or "a\u200B" would be classified from a
 * single code point somewhere inside it.
 */
const SpecialCharRegExp = new RegExp("^(?:(\\s)|(\\p{Mn})|(\\p{Cf}))$", "u");

/**
 * Classify a character as whitespace, zero-width diacritic or invisible
 * format mark.
 *
 * @param {string} char - The string to classify. Only a string holding a
 *   single code point can be reported as special; anything longer (or
 *   empty) yields `false` for every flag.
 * @returns {{isWhitespace: boolean, isZeroWidthDiacritic: boolean,
 *   isInvisibleFormatMark: boolean}} At most one flag is `true`. Whitespace
 *   is tested first, so a code point that is both `\s` and `\p{Cf}`
 *   (e.g. U+FEFF) is reported as whitespace.
 */
function getCharUnicodeCategory(char) {
  const groups = char.match(SpecialCharRegExp);
  if (groups === null) {
    return {
      isWhitespace: false,
      isZeroWidthDiacritic: false,
      isInvisibleFormatMark: false,
    };
  }
  // A capture group that did not take part in the match is `undefined`;
  // one that did holds the (non-empty) matched character.
  return {
    isWhitespace: groups[1] !== undefined,
    isZeroWidthDiacritic: groups[2] !== undefined,
    isInvisibleFormatMark: groups[3] !== undefined,
  };
}
|
||||
|
||||
|
1
test/pdfs/issue9186.pdf.link
Normal file
1
test/pdfs/issue9186.pdf.link
Normal file
@ -0,0 +1 @@
|
||||
https://github.com/mozilla/pdf.js/files/1500985/Sample.lease.contract.26.pdf
|
@ -6230,5 +6230,13 @@
|
||||
"firstPage": 1,
|
||||
"lastPage": 1,
|
||||
"type": "eq"
|
||||
},
|
||||
{ "id": "issue9186",
|
||||
"file": "pdfs/issue9186.pdf",
|
||||
"md5": "d151857bb724ab9a291704a45a0b5d7f",
|
||||
"rounds": 1,
|
||||
"link": true,
|
||||
"lastPage": 1,
|
||||
"type": "text"
|
||||
}
|
||||
]
|
||||
|
@ -2147,6 +2147,29 @@ sozialökonomische Gerechtigkeit.`)
|
||||
await loadingTask.destroy();
|
||||
});
|
||||
|
||||
it("gets text content, with invisible text marks (issue 9186)", async function () {
  // The PDF for this case is a linked test file, which is not supported
  // when the tests run in Node.js.
  if (isNodeJS) {
    pending("Linked test-cases are not supported in Node.js.");
  }

  // Paragraph that has to be present, verbatim, in the merged page text.
  const expectedParagraph = `This Agreement (“Agreement”) is made as of this 25th day of January, 2017, by and
between EDWARD G. ATSINGER III, not individually but as sole Trustee of the ATSINGER
FAMILY TRUST /u/a dated October 31, 1980 as amended, and STUART W. EPPERSON, not
individually but solely as Trustee of the STUART W. EPPERSON REVOCABLE LIVING
TRUST /u/a dated January 14th 1993 as amended, collectively referred to herein as “Lessor”, and
Caron Broadcasting, Inc., an Ohio corporation (“Lessee”).`;

  const loadingTask = getDocument(buildGetDocumentParams("issue9186.pdf"));
  const doc = await loadingTask.promise;
  const firstPage = await doc.getPage(1);
  const textContent = await firstPage.getTextContent();
  const pageText = mergeText(textContent.items);

  expect(pageText.includes(expectedParagraph)).toEqual(true);

  await loadingTask.destroy();
});
|
||||
|
||||
it("gets text content, with beginbfrange operator handled correctly (bug 1627427)", async function () {
|
||||
const loadingTask = getDocument(
|
||||
buildGetDocumentParams("bug1627427_reduced.pdf")
|
||||
|
@ -47,19 +47,67 @@ describe("unicode", function () {
|
||||
it("should correctly determine the character category", function () {
|
||||
const tests = {
|
||||
// Whitespace
|
||||
" ": { isDiacritic: false, isWhitespace: true },
|
||||
"\t": { isDiacritic: false, isWhitespace: true },
|
||||
"\u2001": { isDiacritic: false, isWhitespace: true },
|
||||
"\uFEFF": { isDiacritic: false, isWhitespace: true },
|
||||
" ": {
|
||||
isZeroWidthDiacritic: false,
|
||||
isInvisibleFormatMark: false,
|
||||
isWhitespace: true,
|
||||
},
|
||||
"\t": {
|
||||
isZeroWidthDiacritic: false,
|
||||
isInvisibleFormatMark: false,
|
||||
isWhitespace: true,
|
||||
},
|
||||
"\u2001": {
|
||||
isZeroWidthDiacritic: false,
|
||||
isInvisibleFormatMark: false,
|
||||
isWhitespace: true,
|
||||
},
|
||||
"\uFEFF": {
|
||||
isZeroWidthDiacritic: false,
|
||||
isInvisibleFormatMark: false,
|
||||
isWhitespace: true,
|
||||
},
|
||||
|
||||
// Diacritic
|
||||
"\u0302": { isDiacritic: true, isWhitespace: false },
|
||||
"\u0344": { isDiacritic: true, isWhitespace: false },
|
||||
"\u0361": { isDiacritic: true, isWhitespace: false },
|
||||
"\u0302": {
|
||||
isZeroWidthDiacritic: true,
|
||||
isInvisibleFormatMark: false,
|
||||
isWhitespace: false,
|
||||
},
|
||||
"\u0344": {
|
||||
isZeroWidthDiacritic: true,
|
||||
isInvisibleFormatMark: false,
|
||||
isWhitespace: false,
|
||||
},
|
||||
"\u0361": {
|
||||
isZeroWidthDiacritic: true,
|
||||
isInvisibleFormatMark: false,
|
||||
isWhitespace: false,
|
||||
},
|
||||
|
||||
// No whitespace or diacritic
|
||||
a: { isDiacritic: false, isWhitespace: false },
|
||||
1: { isDiacritic: false, isWhitespace: false },
|
||||
// Invisible format mark
|
||||
"\u200B": {
|
||||
isZeroWidthDiacritic: false,
|
||||
isInvisibleFormatMark: true,
|
||||
isWhitespace: false,
|
||||
},
|
||||
"\u200D": {
|
||||
isZeroWidthDiacritic: false,
|
||||
isInvisibleFormatMark: true,
|
||||
isWhitespace: false,
|
||||
},
|
||||
|
||||
// No whitespace or diacritic or invisible format mark
|
||||
a: {
|
||||
isZeroWidthDiacritic: false,
|
||||
isInvisibleFormatMark: false,
|
||||
isWhitespace: false,
|
||||
},
|
||||
1: {
|
||||
isZeroWidthDiacritic: false,
|
||||
isInvisibleFormatMark: false,
|
||||
isWhitespace: false,
|
||||
},
|
||||
};
|
||||
for (const [character, expectation] of Object.entries(tests)) {
|
||||
expect(getCharUnicodeCategory(character)).toEqual(expectation);
|
||||
|
Loading…
Reference in New Issue
Block a user