Don't replace an eol by a whitespace when the last char is a Katakana-Hiragana diacritic

This commit is contained in:
Calixte Denizet 2023-02-16 11:25:15 +01:00
parent 546902df63
commit fc7d74385f
4 changed files with 76 additions and 14 deletions

View File

@ -574,3 +574,4 @@
!bug1815476.pdf !bug1815476.pdf
!issue16021.pdf !issue16021.pdf
!bug1770750.pdf !bug1770750.pdf
!issue16063.pdf

BIN
test/pdfs/issue16063.pdf Executable file

Binary file not shown.

View File

@ -818,4 +818,40 @@ describe("pdf_find_controller", function () {
}, },
}); });
}); });
it("performs a search in a text with some Hiragana diacritics at the end of a line", async function () {
  // Regression coverage for issue16063.pdf: each entry is one query that is
  // searched in turn, together with the single match expected on the first
  // page (the values handed to `pageMatches` and `pageMatchesLength`).
  // NOTE(review): presumably `pageMatch` is the match position in the page
  // text and `pageMatchLength` its length — confirm against `testSearch`.
  const expectations = [
    { query: "行うことができる速結端子", pageMatch: 63, pageMatchLength: 12 },
    { query: "デュプレックス", pageMatch: 205, pageMatchLength: 7 },
  ];

  const controller = await initPdfFindController("issue16063.pdf");

  // The searches run one after the other against the same controller,
  // so they are awaited sequentially rather than in parallel.
  for (const { query, pageMatch, pageMatchLength } of expectations) {
    await testSearch({
      eventBus: controller.eventBus,
      pdfFindController: controller.pdfFindController,
      state: { query },
      matchesPerPage: [1],
      selectedMatch: { pageIndex: 0, matchIndex: 0 },
      pageMatches: [[pageMatch]],
      pageMatchesLength: [[pageMatchLength]],
    });
  }
});
}); });

View File

@ -136,7 +136,8 @@ function normalize(text) {
// 3040-309F: Hiragana // 3040-309F: Hiragana
// 30A0-30FF: Katakana // 30A0-30FF: Katakana
const CJK = "(?:\\p{Ideographic}|[\u3040-\u30FF])"; const CJK = "(?:\\p{Ideographic}|[\u3040-\u30FF])";
const regexp = `([${replace}])|([${toNormalizeWithNFKC}])|(\\p{M}+(?:-\\n)?)|(\\S-\\n)|(${CJK}\\n)|(\\n)`; const HKDiacritics = "(?:\u3099|\u309A)";
const regexp = `([${replace}])|([${toNormalizeWithNFKC}])|(${HKDiacritics}\\n)|(\\p{M}+(?:-\\n)?)|(\\S-\\n)|(${CJK}\\n)|(\\n)`;
if (syllablePositions.length === 0) { if (syllablePositions.length === 0) {
// Most of the syllables belong to Hangul so there are no need // Most of the syllables belong to Hangul so there are no need
@ -198,7 +199,7 @@ function normalize(text) {
normalized = normalized.replace( normalized = normalized.replace(
normalizationRegex, normalizationRegex,
(match, p1, p2, p3, p4, p5, p6, p7, i) => { (match, p1, p2, p3, p4, p5, p6, p7, p8, i) => {
i -= shiftOrigin; i -= shiftOrigin;
if (p1) { if (p1) {
// Maybe fractions or quotations mark... // Maybe fractions or quotations mark...
@ -227,8 +228,32 @@ function normalize(text) {
} }
if (p3) { if (p3) {
const hasTrailingDashEOL = p3.endsWith("\n"); // We've a Katakana-Hiragana diacritic followed by a \n so don't replace
const len = hasTrailingDashEOL ? p3.length - 2 : p3.length; // the \n by a whitespace.
hasDiacritics = true;
// Diacritic.
if (i + eol === rawDiacriticsPositions[rawDiacriticsIndex]?.[1]) {
++rawDiacriticsIndex;
} else {
// i is the position of the first diacritic
// so (i - 1) is the position for the letter before.
positions.push([i - 1 - shift + 1, shift - 1]);
shift -= 1;
shiftOrigin += 1;
}
// End-of-line.
positions.push([i - shift + 1, shift]);
shiftOrigin += 1;
eol += 1;
return p3.charAt(0);
}
if (p4) {
const hasTrailingDashEOL = p4.endsWith("\n");
const len = hasTrailingDashEOL ? p4.length - 2 : p4.length;
// Diacritics. // Diacritics.
hasDiacritics = true; hasDiacritics = true;
@ -248,19 +273,19 @@ function normalize(text) {
if (hasTrailingDashEOL) { if (hasTrailingDashEOL) {
// Diacritics are followed by a -\n. // Diacritics are followed by a -\n.
// See comments in `if (p4)` block. // See comments in `if (p5)` block.
i += len - 1; i += len - 1;
positions.push([i - shift + 1, 1 + shift]); positions.push([i - shift + 1, 1 + shift]);
shift += 1; shift += 1;
shiftOrigin += 1; shiftOrigin += 1;
eol += 1; eol += 1;
return p3.slice(0, len); return p4.slice(0, len);
} }
return p3; return p4;
} }
if (p4) { if (p5) {
// "X-\n" is removed because an hyphen at the end of a line // "X-\n" is removed because an hyphen at the end of a line
// with not a space before is likely here to mark a break // with not a space before is likely here to mark a break
// in a word. // in a word.
@ -269,19 +294,19 @@ function normalize(text) {
shift += 1; shift += 1;
shiftOrigin += 1; shiftOrigin += 1;
eol += 1; eol += 1;
return p4.charAt(0); return p5.charAt(0);
} }
if (p5) { if (p6) {
// An ideographic at the end of a line doesn't imply adding an extra // An ideographic at the end of a line doesn't imply adding an extra
// white space. // white space.
positions.push([i - shift + 1, shift]); positions.push([i - shift + 1, shift]);
shiftOrigin += 1; shiftOrigin += 1;
eol += 1; eol += 1;
return p5.charAt(0); return p6.charAt(0);
} }
if (p6) { if (p7) {
// eol is replaced by space: "foo\nbar" is likely equivalent to // eol is replaced by space: "foo\nbar" is likely equivalent to
// "foo bar". // "foo bar".
positions.push([i - shift + 1, shift - 1]); positions.push([i - shift + 1, shift - 1]);
@ -291,7 +316,7 @@ function normalize(text) {
return " "; return " ";
} }
// p7 // p8
if (i + eol === syllablePositions[syllableIndex]?.[1]) { if (i + eol === syllablePositions[syllableIndex]?.[1]) {
// A syllable (1 char) is replaced with several chars (n) so // A syllable (1 char) is replaced with several chars (n) so
// newCharsLen = n - 1. // newCharsLen = n - 1.
@ -303,7 +328,7 @@ function normalize(text) {
shift -= newCharLen; shift -= newCharLen;
shiftOrigin += newCharLen; shiftOrigin += newCharLen;
} }
return p7; return p8;
} }
); );