Remove the invisible format marks (Unicode category Cf) from the text chunks
- This aims to fix issue #9186.
This commit is contained in:
parent
88236e1163
commit
e1d3a3b414
@ -2561,6 +2561,9 @@ class PartialEvaluator {
|
|||||||
|
|
||||||
for (let i = 0, ii = glyphs.length; i < ii; i++) {
|
for (let i = 0, ii = glyphs.length; i < ii; i++) {
|
||||||
const glyph = glyphs[i];
|
const glyph = glyphs[i];
|
||||||
|
if (glyph.isInvisibleFormatMark) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
let charSpacing =
|
let charSpacing =
|
||||||
textState.charSpacing + (i + 1 === ii ? extraSpacing : 0);
|
textState.charSpacing + (i + 1 === ii ? extraSpacing : 0);
|
||||||
|
|
||||||
@ -2601,7 +2604,7 @@ class PartialEvaluator {
|
|||||||
// Must be called after compareWithLastPosition because
|
// Must be called after compareWithLastPosition because
|
||||||
// the textContentItem could have been flushed.
|
// the textContentItem could have been flushed.
|
||||||
const textChunk = ensureTextContentItem();
|
const textChunk = ensureTextContentItem();
|
||||||
if (glyph.isDiacritic) {
|
if (glyph.isZeroWidthDiacritic) {
|
||||||
scaledDim = 0;
|
scaledDim = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -216,7 +216,8 @@ class Glyph {
|
|||||||
|
|
||||||
const category = getCharUnicodeCategory(unicode);
|
const category = getCharUnicodeCategory(unicode);
|
||||||
this.isWhitespace = category.isWhitespace;
|
this.isWhitespace = category.isWhitespace;
|
||||||
this.isDiacritic = category.isDiacritic;
|
this.isZeroWidthDiacritic = category.isZeroWidthDiacritic;
|
||||||
|
this.isInvisibleFormatMark = category.isInvisibleFormatMark;
|
||||||
}
|
}
|
||||||
|
|
||||||
matchesForCache(
|
matchesForCache(
|
||||||
|
@ -1640,12 +1640,13 @@ function reverseIfRtl(chars) {
|
|||||||
return buf.join("");
|
return buf.join("");
|
||||||
}
|
}
|
||||||
|
|
||||||
const SpecialCharRegExp = new RegExp("^(\\s)|(\\p{Mn})$", "u");
|
// Classifies a glyph's Unicode string as a whole. Capture groups:
//   1 - whitespace (`\s`)
//   2 - non-spacing mark (`\p{Mn}`), i.e. a zero-width diacritic
//   3 - format character (`\p{Cf}`), i.e. an invisible format mark
// The alternatives are wrapped in a non-capturing group so that `^` and `$`
// apply to every one of them. Without the group, `^` only binds to `(\s)` and
// `$` only to `(\p{Cf})`, so a multi-character string such as "a\u200D" would
// be reported as an invisible format mark. `(\s)` is tried first, hence
// U+FEFF (both `\s` and Cf) is still reported as whitespace.
// The pattern is built from a string rather than a literal, as before.
const SpecialCharRegExp = new RegExp("^(?:(\\s)|(\\p{Mn})|(\\p{Cf}))$", "u");
|
||||||
function getCharUnicodeCategory(char) {
|
function getCharUnicodeCategory(char) {
|
||||||
const groups = char.match(SpecialCharRegExp);
|
const groups = char.match(SpecialCharRegExp);
|
||||||
return {
|
return {
|
||||||
isWhitespace: !!(groups && groups[1]),
|
isWhitespace: !!(groups && groups[1]),
|
||||||
isDiacritic: !!(groups && groups[2]),
|
isZeroWidthDiacritic: !!(groups && groups[2]),
|
||||||
|
isInvisibleFormatMark: !!(groups && groups[3]),
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
1
test/pdfs/issue9186.pdf.link
Normal file
1
test/pdfs/issue9186.pdf.link
Normal file
@ -0,0 +1 @@
|
|||||||
|
https://github.com/mozilla/pdf.js/files/1500985/Sample.lease.contract.26.pdf
|
@ -6230,5 +6230,13 @@
|
|||||||
"firstPage": 1,
|
"firstPage": 1,
|
||||||
"lastPage": 1,
|
"lastPage": 1,
|
||||||
"type": "eq"
|
"type": "eq"
|
||||||
|
},
|
||||||
|
{ "id": "issue9186",
|
||||||
|
"file": "pdfs/issue9186.pdf",
|
||||||
|
"md5": "d151857bb724ab9a291704a45a0b5d7f",
|
||||||
|
"rounds": 1,
|
||||||
|
"link": true,
|
||||||
|
"lastPage": 1,
|
||||||
|
"type": "text"
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
|
@ -2147,6 +2147,29 @@ sozialökonomische Gerechtigkeit.`)
|
|||||||
await loadingTask.destroy();
|
await loadingTask.destroy();
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it("gets text content, with invisible text marks (issue 9186)", async function () {
|
||||||
|
if (isNodeJS) {
|
||||||
|
pending("Linked test-cases are not supported in Node.js.");
|
||||||
|
}
|
||||||
|
|
||||||
|
const loadingTask = getDocument(buildGetDocumentParams("issue9186.pdf"));
|
||||||
|
const pdfDoc = await loadingTask.promise;
|
||||||
|
const pdfPage = await pdfDoc.getPage(1);
|
||||||
|
const { items } = await pdfPage.getTextContent();
|
||||||
|
const text = mergeText(items);
|
||||||
|
|
||||||
|
expect(
|
||||||
|
text.includes(`This Agreement (“Agreement”) is made as of this 25th day of January, 2017, by and
|
||||||
|
between EDWARD G. ATSINGER III, not individually but as sole Trustee of the ATSINGER
|
||||||
|
FAMILY TRUST /u/a dated October 31, 1980 as amended, and STUART W. EPPERSON, not
|
||||||
|
individually but solely as Trustee of the STUART W. EPPERSON REVOCABLE LIVING
|
||||||
|
TRUST /u/a dated January 14th 1993 as amended, collectively referred to herein as “Lessor”, and
|
||||||
|
Caron Broadcasting, Inc., an Ohio corporation (“Lessee”).`)
|
||||||
|
).toEqual(true);
|
||||||
|
|
||||||
|
await loadingTask.destroy();
|
||||||
|
});
|
||||||
|
|
||||||
it("gets text content, with beginbfrange operator handled correctly (bug 1627427)", async function () {
|
it("gets text content, with beginbfrange operator handled correctly (bug 1627427)", async function () {
|
||||||
const loadingTask = getDocument(
|
const loadingTask = getDocument(
|
||||||
buildGetDocumentParams("bug1627427_reduced.pdf")
|
buildGetDocumentParams("bug1627427_reduced.pdf")
|
||||||
|
@ -47,19 +47,67 @@ describe("unicode", function () {
|
|||||||
it("should correctly determine the character category", function () {
|
it("should correctly determine the character category", function () {
|
||||||
const tests = {
|
const tests = {
|
||||||
// Whitespace
|
// Whitespace
|
||||||
" ": { isDiacritic: false, isWhitespace: true },
|
" ": {
|
||||||
"\t": { isDiacritic: false, isWhitespace: true },
|
isZeroWidthDiacritic: false,
|
||||||
"\u2001": { isDiacritic: false, isWhitespace: true },
|
isInvisibleFormatMark: false,
|
||||||
"\uFEFF": { isDiacritic: false, isWhitespace: true },
|
isWhitespace: true,
|
||||||
|
},
|
||||||
|
"\t": {
|
||||||
|
isZeroWidthDiacritic: false,
|
||||||
|
isInvisibleFormatMark: false,
|
||||||
|
isWhitespace: true,
|
||||||
|
},
|
||||||
|
"\u2001": {
|
||||||
|
isZeroWidthDiacritic: false,
|
||||||
|
isInvisibleFormatMark: false,
|
||||||
|
isWhitespace: true,
|
||||||
|
},
|
||||||
|
"\uFEFF": {
|
||||||
|
isZeroWidthDiacritic: false,
|
||||||
|
isInvisibleFormatMark: false,
|
||||||
|
isWhitespace: true,
|
||||||
|
},
|
||||||
|
|
||||||
// Diacritic
|
// Diacritic
|
||||||
"\u0302": { isDiacritic: true, isWhitespace: false },
|
"\u0302": {
|
||||||
"\u0344": { isDiacritic: true, isWhitespace: false },
|
isZeroWidthDiacritic: true,
|
||||||
"\u0361": { isDiacritic: true, isWhitespace: false },
|
isInvisibleFormatMark: false,
|
||||||
|
isWhitespace: false,
|
||||||
|
},
|
||||||
|
"\u0344": {
|
||||||
|
isZeroWidthDiacritic: true,
|
||||||
|
isInvisibleFormatMark: false,
|
||||||
|
isWhitespace: false,
|
||||||
|
},
|
||||||
|
"\u0361": {
|
||||||
|
isZeroWidthDiacritic: true,
|
||||||
|
isInvisibleFormatMark: false,
|
||||||
|
isWhitespace: false,
|
||||||
|
},
|
||||||
|
|
||||||
// No whitespace or diacritic
|
// Invisible format mark
|
||||||
a: { isDiacritic: false, isWhitespace: false },
|
"\u200B": {
|
||||||
1: { isDiacritic: false, isWhitespace: false },
|
isZeroWidthDiacritic: false,
|
||||||
|
isInvisibleFormatMark: true,
|
||||||
|
isWhitespace: false,
|
||||||
|
},
|
||||||
|
"\u200D": {
|
||||||
|
isZeroWidthDiacritic: false,
|
||||||
|
isInvisibleFormatMark: true,
|
||||||
|
isWhitespace: false,
|
||||||
|
},
|
||||||
|
|
||||||
|
// No whitespace or diacritic or invisible format mark
|
||||||
|
a: {
|
||||||
|
isZeroWidthDiacritic: false,
|
||||||
|
isInvisibleFormatMark: false,
|
||||||
|
isWhitespace: false,
|
||||||
|
},
|
||||||
|
1: {
|
||||||
|
isZeroWidthDiacritic: false,
|
||||||
|
isInvisibleFormatMark: false,
|
||||||
|
isWhitespace: false,
|
||||||
|
},
|
||||||
};
|
};
|
||||||
for (const [character, expectation] of Object.entries(tests)) {
|
for (const [character, expectation] of Object.entries(tests)) {
|
||||||
expect(getCharUnicodeCategory(character)).toEqual(expectation);
|
expect(getCharUnicodeCategory(character)).toEqual(expectation);
|
||||||
|
Loading…
Reference in New Issue
Block a user