Merge pull request from calixteman/15690

Normalize fullwidth, halfwidth and circled chars when searching
This commit is contained in:
calixteman 2022-11-14 21:36:29 +01:00 committed by GitHub
commit 859335a1ae
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 60 additions and 15 deletions

@ -554,3 +554,4 @@
!bug1796741.pdf !bug1796741.pdf
!textfields.pdf !textfields.pdf
!freetext_no_appearance.pdf !freetext_no_appearance.pdf
!issue15690.pdf

BIN
test/pdfs/issue15690.pdf Executable file

Binary file not shown.

@ -647,4 +647,25 @@ describe("pdf_find_controller", function () {
pageMatchesLength: [[4]], pageMatchesLength: [[4]],
}); });
}); });
// Searches the "issue15690.pdf" fixture for the plain letter "o" and checks
// the matches reported for its first page. Judging by the test title, the
// fixture contains fullwidth characters that must be found by an ASCII
// query -- the fixture content itself is not visible here, so treat that as
// an assumption.
it("performs a search in a text containing fullwidth chars", async function () {
  // Offsets at which a match is expected on page 0.
  const expectedOffsets = [0, 10, 13, 30, 39, 41, 55, 60, 66, 84, 102, 117, 134];
  // Every expected match is exactly one character long.
  const expectedLengths = expectedOffsets.map(() => 1);

  const setup = await initPdfFindController("issue15690.pdf");

  await testSearch({
    eventBus: setup.eventBus,
    pdfFindController: setup.pdfFindController,
    state: { query: "o" },
    // One page, with as many matches as there are expected offsets (13).
    matchesPerPage: [expectedOffsets.length],
    // The first match on the first page is the selected one.
    selectedMatch: { pageIndex: 0, matchIndex: 0 },
    pageMatches: [expectedOffsets],
    pageMatchesLength: [expectedLengths],
  });
});
}); });

@ -95,6 +95,8 @@ const SYLLABLES_LENGTHS = new Map();
const FIRST_CHAR_SYLLABLES_REG_EXP = const FIRST_CHAR_SYLLABLES_REG_EXP =
"[\\u1100-\\u1112\\ud7a4-\\ud7af\\ud84a\\ud84c\\ud850\\ud854\\ud857\\ud85f]"; "[\\u1100-\\u1112\\ud7a4-\\ud7af\\ud84a\\ud84c\\ud850\\ud854\\ud857\\ud85f]";
const NFKC_CHARS_TO_NORMALIZE = new Map();
let noSyllablesRegExp = null; let noSyllablesRegExp = null;
let withSyllablesRegExp = null; let withSyllablesRegExp = null;
@ -126,7 +128,13 @@ function normalize(text) {
} else { } else {
// Compile the regular expression for text normalization once. // Compile the regular expression for text normalization once.
const replace = Object.keys(CHARACTERS_TO_NORMALIZE).join(""); const replace = Object.keys(CHARACTERS_TO_NORMALIZE).join("");
const regexp = `([${replace}])|(\\p{M}+(?:-\\n)?)|(\\S-\\n)|(\\p{Ideographic}\\n)|(\\n)`; const toNormalizeWithNFKC =
"\u2460-\u2473" + // Circled numbers.
"\u24b6-\u24ff" + // Circled letters/numbers.
"\u3244-\u32bf" + // Circled ideograms/numbers.
"\u32d0-\u32fe" + // Circled ideograms.
"\uff00-\uffef"; // Halfwidth, fullwidth forms.
const regexp = `([${replace}])|([${toNormalizeWithNFKC}])|(\\p{M}+(?:-\\n)?)|(\\S-\\n)|(\\p{Ideographic}\\n)|(\\n)`;
if (syllablePositions.length === 0) { if (syllablePositions.length === 0) {
// Most of the syllables belong to Hangul so there is no need // Most of the syllables belong to Hangul so there is no need
@ -188,11 +196,11 @@ function normalize(text) {
normalized = normalized.replace( normalized = normalized.replace(
normalizationRegex, normalizationRegex,
(match, p1, p2, p3, p4, p5, p6, i) => { (match, p1, p2, p3, p4, p5, p6, p7, i) => {
i -= shiftOrigin; i -= shiftOrigin;
if (p1) { if (p1) {
// Maybe fractions or quotations mark... // Maybe fractions or quotations mark...
const replacement = CHARACTERS_TO_NORMALIZE[match]; const replacement = CHARACTERS_TO_NORMALIZE[p1];
const jj = replacement.length; const jj = replacement.length;
for (let j = 1; j < jj; j++) { for (let j = 1; j < jj; j++) {
positions.push([i - shift + j, shift - j]); positions.push([i - shift + j, shift - j]);
@ -202,8 +210,23 @@ function normalize(text) {
} }
if (p2) { if (p2) {
const hasTrailingDashEOL = p2.endsWith("\n"); // Use the NFKC representation to normalize the char.
const len = hasTrailingDashEOL ? p2.length - 2 : p2.length; let replacement = NFKC_CHARS_TO_NORMALIZE.get(p2);
if (!replacement) {
replacement = p2.normalize("NFKC");
NFKC_CHARS_TO_NORMALIZE.set(p2, replacement);
}
const jj = replacement.length;
for (let j = 1; j < jj; j++) {
positions.push([i - shift + j, shift - j]);
}
shift -= jj - 1;
return replacement;
}
if (p3) {
const hasTrailingDashEOL = p3.endsWith("\n");
const len = hasTrailingDashEOL ? p3.length - 2 : p3.length;
// Diacritics. // Diacritics.
hasDiacritics = true; hasDiacritics = true;
@ -223,19 +246,19 @@ function normalize(text) {
if (hasTrailingDashEOL) { if (hasTrailingDashEOL) {
// Diacritics are followed by a -\n. // Diacritics are followed by a -\n.
// See comments in `if (p3)` block. // See comments in `if (p4)` block.
i += len - 1; i += len - 1;
positions.push([i - shift + 1, 1 + shift]); positions.push([i - shift + 1, 1 + shift]);
shift += 1; shift += 1;
shiftOrigin += 1; shiftOrigin += 1;
eol += 1; eol += 1;
return p2.slice(0, len); return p3.slice(0, len);
} }
return p2; return p3;
} }
if (p3) { if (p4) {
// "X-\n" is removed because a hyphen at the end of a line // "X-\n" is removed because a hyphen at the end of a line
// that is not preceded by a space is likely there to mark a break // that is not preceded by a space is likely there to mark a break
// in a word. // in a word.
@ -244,19 +267,19 @@ function normalize(text) {
shift += 1; shift += 1;
shiftOrigin += 1; shiftOrigin += 1;
eol += 1; eol += 1;
return p3.charAt(0); return p4.charAt(0);
} }
if (p4) { if (p5) {
// An ideographic at the end of a line doesn't imply adding an extra // An ideographic at the end of a line doesn't imply adding an extra
// white space. // white space.
positions.push([i - shift + 1, shift]); positions.push([i - shift + 1, shift]);
shiftOrigin += 1; shiftOrigin += 1;
eol += 1; eol += 1;
return p4.charAt(0); return p5.charAt(0);
} }
if (p5) { if (p6) {
// eol is replaced by space: "foo\nbar" is likely equivalent to // eol is replaced by space: "foo\nbar" is likely equivalent to
// "foo bar". // "foo bar".
positions.push([i - shift + 1, shift - 1]); positions.push([i - shift + 1, shift - 1]);
@ -266,7 +289,7 @@ function normalize(text) {
return " "; return " ";
} }
// p6 // p7
if (i + eol === syllablePositions[syllableIndex]?.[1]) { if (i + eol === syllablePositions[syllableIndex]?.[1]) {
// A syllable (1 char) is replaced with several chars (n) so // A syllable (1 char) is replaced with several chars (n) so
// newCharsLen = n - 1. // newCharsLen = n - 1.
@ -278,7 +301,7 @@ function normalize(text) {
shift -= newCharLen; shift -= newCharLen;
shiftOrigin += newCharLen; shiftOrigin += newCharLen;
} }
return p6; return p7;
} }
); );