Merge pull request #15395 from calixteman/15340

Don't replace cr by a white space when the last char on the line is an ideographic char
This commit is contained in:
calixteman 2022-09-04 14:30:22 +02:00 committed by GitHub
commit 72375ed652
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 35 additions and 4 deletions

View File

@ -538,3 +538,4 @@
!bug1782186.pdf
!tracemonkey_a11y.pdf
!bug1782564.pdf
!issue15340.pdf

BIN
test/pdfs/issue15340.pdf Normal file

Binary file not shown.

View File

@ -626,4 +626,25 @@ describe("pdf_find_controller", function () {
pageMatchesLength: [[8]],
});
});
// Regression test tied to the issue15340.pdf fixture: searching for a run of
// ideographic characters must yield exactly one match on the first page.
// NOTE(review): presumably the query straddles a line break in the fixture's
// extracted text -- confirm against the PDF.
it("performs a search in a text containing an ideographic at the end of a line", async function () {
  const fixtureName = "issue15340.pdf";
  const searchState = { query: "検知機構" };
  // The first (and only) match on the first page is the selected one.
  const expectedSelection = { pageIndex: 0, matchIndex: 0 };

  const { eventBus, pdfFindController } = await initPdfFindController(
    fixtureName
  );

  await testSearch({
    eventBus,
    pdfFindController,
    state: searchState,
    matchesPerPage: [1],
    selectedMatch: expectedSelection,
    // One entry per page: expected match offsets, then their lengths (the
    // query is four characters long).
    pageMatches: [[29]],
    pageMatchesLength: [[4]],
  });
});
});

View File

@ -126,7 +126,7 @@ function normalize(text) {
} else {
// Compile the regular expression for text normalization once.
const replace = Object.keys(CHARACTERS_TO_NORMALIZE).join("");
const regexp = `([${replace}])|(\\p{M}+(?:-\\n)?)|(\\S-\\n)|(\\n)`;
const regexp = `([${replace}])|(\\p{M}+(?:-\\n)?)|(\\S-\\n)|(\\p{Ideographic}\\n)|(\\n)`;
if (syllablePositions.length === 0) {
// Most of the syllables belong to Hangul so there is no need
@ -188,7 +188,7 @@ function normalize(text) {
normalized = normalized.replace(
normalizationRegex,
(match, p1, p2, p3, p4, p5, i) => {
(match, p1, p2, p3, p4, p5, p6, i) => {
i -= shiftOrigin;
if (p1) {
// Maybe fractions or quotations mark...
@ -248,6 +248,15 @@ function normalize(text) {
}
if (p4) {
// An ideographic character at the end of a line doesn't imply that a
// white space must be inserted in place of the line break.
positions.push([i - shift + 1, shift]);
shiftOrigin += 1;
eol += 1;
return p4.charAt(0);
}
if (p5) {
// eol is replaced by space: "foo\nbar" is likely equivalent to
// "foo bar".
positions.push([i - shift + 1, shift - 1]);
@ -257,7 +266,7 @@ function normalize(text) {
return " ";
}
// p5
// p6
if (i + eol === syllablePositions[syllableIndex]?.[1]) {
// A syllable (1 char) is replaced with several chars (n) so
// newCharsLen = n - 1.
@ -269,7 +278,7 @@ function normalize(text) {
shift -= newCharLen;
shiftOrigin += newCharLen;
}
return p5;
return p6;
}
);