Merge pull request #15694 from calixteman/15690
Normalize fullwidth, halfwidth and circled chars when searching
This commit is contained in:
commit
859335a1ae
1
test/pdfs/.gitignore
vendored
1
test/pdfs/.gitignore
vendored
@ -554,3 +554,4 @@
|
|||||||
!bug1796741.pdf
|
!bug1796741.pdf
|
||||||
!textfields.pdf
|
!textfields.pdf
|
||||||
!freetext_no_appearance.pdf
|
!freetext_no_appearance.pdf
|
||||||
|
!issue15690.pdf
|
||||||
|
BIN
test/pdfs/issue15690.pdf
Executable file
BIN
test/pdfs/issue15690.pdf
Executable file
Binary file not shown.
@ -647,4 +647,25 @@ describe("pdf_find_controller", function () {
|
|||||||
pageMatchesLength: [[4]],
|
pageMatchesLength: [[4]],
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it("performs a search in a text containing fullwidth chars", async function () {
|
||||||
|
const { eventBus, pdfFindController } = await initPdfFindController(
|
||||||
|
"issue15690.pdf"
|
||||||
|
);
|
||||||
|
|
||||||
|
await testSearch({
|
||||||
|
eventBus,
|
||||||
|
pdfFindController,
|
||||||
|
state: {
|
||||||
|
query: "o",
|
||||||
|
},
|
||||||
|
matchesPerPage: [13],
|
||||||
|
selectedMatch: {
|
||||||
|
pageIndex: 0,
|
||||||
|
matchIndex: 0,
|
||||||
|
},
|
||||||
|
pageMatches: [[0, 10, 13, 30, 39, 41, 55, 60, 66, 84, 102, 117, 134]],
|
||||||
|
pageMatchesLength: [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
|
||||||
|
});
|
||||||
|
});
|
||||||
});
|
});
|
||||||
|
@ -95,6 +95,8 @@ const SYLLABLES_LENGTHS = new Map();
|
|||||||
const FIRST_CHAR_SYLLABLES_REG_EXP =
|
const FIRST_CHAR_SYLLABLES_REG_EXP =
|
||||||
"[\\u1100-\\u1112\\ud7a4-\\ud7af\\ud84a\\ud84c\\ud850\\ud854\\ud857\\ud85f]";
|
"[\\u1100-\\u1112\\ud7a4-\\ud7af\\ud84a\\ud84c\\ud850\\ud854\\ud857\\ud85f]";
|
||||||
|
|
||||||
|
const NFKC_CHARS_TO_NORMALIZE = new Map();
|
||||||
|
|
||||||
let noSyllablesRegExp = null;
|
let noSyllablesRegExp = null;
|
||||||
let withSyllablesRegExp = null;
|
let withSyllablesRegExp = null;
|
||||||
|
|
||||||
@ -126,7 +128,13 @@ function normalize(text) {
|
|||||||
} else {
|
} else {
|
||||||
// Compile the regular expression for text normalization once.
|
// Compile the regular expression for text normalization once.
|
||||||
const replace = Object.keys(CHARACTERS_TO_NORMALIZE).join("");
|
const replace = Object.keys(CHARACTERS_TO_NORMALIZE).join("");
|
||||||
const regexp = `([${replace}])|(\\p{M}+(?:-\\n)?)|(\\S-\\n)|(\\p{Ideographic}\\n)|(\\n)`;
|
const toNormalizeWithNFKC =
|
||||||
|
"\u2460-\u2473" + // Circled numbers.
|
||||||
|
"\u24b6-\u24ff" + // Circled letters/numbers.
|
||||||
|
"\u3244-\u32bf" + // Circled ideograms/numbers.
|
||||||
|
"\u32d0-\u32fe" + // Circled ideograms.
|
||||||
|
"\uff00-\uffef"; // Halfwidth, fullwidth forms.
|
||||||
|
const regexp = `([${replace}])|([${toNormalizeWithNFKC}])|(\\p{M}+(?:-\\n)?)|(\\S-\\n)|(\\p{Ideographic}\\n)|(\\n)`;
|
||||||
|
|
||||||
if (syllablePositions.length === 0) {
|
if (syllablePositions.length === 0) {
|
||||||
// Most of the syllables belong to Hangul so there is no need
|
// Most of the syllables belong to Hangul so there is no need
|
||||||
@ -188,11 +196,11 @@ function normalize(text) {
|
|||||||
|
|
||||||
normalized = normalized.replace(
|
normalized = normalized.replace(
|
||||||
normalizationRegex,
|
normalizationRegex,
|
||||||
(match, p1, p2, p3, p4, p5, p6, i) => {
|
(match, p1, p2, p3, p4, p5, p6, p7, i) => {
|
||||||
i -= shiftOrigin;
|
i -= shiftOrigin;
|
||||||
if (p1) {
|
if (p1) {
|
||||||
// Maybe fractions or quotations mark...
|
// Maybe fractions or quotations mark...
|
||||||
const replacement = CHARACTERS_TO_NORMALIZE[match];
|
const replacement = CHARACTERS_TO_NORMALIZE[p1];
|
||||||
const jj = replacement.length;
|
const jj = replacement.length;
|
||||||
for (let j = 1; j < jj; j++) {
|
for (let j = 1; j < jj; j++) {
|
||||||
positions.push([i - shift + j, shift - j]);
|
positions.push([i - shift + j, shift - j]);
|
||||||
@ -202,8 +210,23 @@ function normalize(text) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (p2) {
|
if (p2) {
|
||||||
const hasTrailingDashEOL = p2.endsWith("\n");
|
// Use the NFKC representation to normalize the char.
|
||||||
const len = hasTrailingDashEOL ? p2.length - 2 : p2.length;
|
let replacement = NFKC_CHARS_TO_NORMALIZE.get(p2);
|
||||||
|
if (!replacement) {
|
||||||
|
replacement = p2.normalize("NFKC");
|
||||||
|
NFKC_CHARS_TO_NORMALIZE.set(p2, replacement);
|
||||||
|
}
|
||||||
|
const jj = replacement.length;
|
||||||
|
for (let j = 1; j < jj; j++) {
|
||||||
|
positions.push([i - shift + j, shift - j]);
|
||||||
|
}
|
||||||
|
shift -= jj - 1;
|
||||||
|
return replacement;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (p3) {
|
||||||
|
const hasTrailingDashEOL = p3.endsWith("\n");
|
||||||
|
const len = hasTrailingDashEOL ? p3.length - 2 : p3.length;
|
||||||
|
|
||||||
// Diacritics.
|
// Diacritics.
|
||||||
hasDiacritics = true;
|
hasDiacritics = true;
|
||||||
@ -223,19 +246,19 @@ function normalize(text) {
|
|||||||
|
|
||||||
if (hasTrailingDashEOL) {
|
if (hasTrailingDashEOL) {
|
||||||
// Diacritics are followed by a -\n.
|
// Diacritics are followed by a -\n.
|
||||||
// See comments in `if (p3)` block.
|
// See comments in `if (p4)` block.
|
||||||
i += len - 1;
|
i += len - 1;
|
||||||
positions.push([i - shift + 1, 1 + shift]);
|
positions.push([i - shift + 1, 1 + shift]);
|
||||||
shift += 1;
|
shift += 1;
|
||||||
shiftOrigin += 1;
|
shiftOrigin += 1;
|
||||||
eol += 1;
|
eol += 1;
|
||||||
return p2.slice(0, len);
|
return p3.slice(0, len);
|
||||||
}
|
}
|
||||||
|
|
||||||
return p2;
|
return p3;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (p3) {
|
if (p4) {
|
||||||
// "X-\n" is removed because a hyphen at the end of a line
|
// "X-\n" is removed because a hyphen at the end of a line
|
||||||
// with no space before it is likely there to mark a break
|
// with no space before it is likely there to mark a break
|
||||||
// in a word.
|
// in a word.
|
||||||
@ -244,19 +267,19 @@ function normalize(text) {
|
|||||||
shift += 1;
|
shift += 1;
|
||||||
shiftOrigin += 1;
|
shiftOrigin += 1;
|
||||||
eol += 1;
|
eol += 1;
|
||||||
return p3.charAt(0);
|
return p4.charAt(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (p4) {
|
if (p5) {
|
||||||
// An ideographic at the end of a line doesn't imply adding an extra
|
// An ideographic at the end of a line doesn't imply adding an extra
|
||||||
// white space.
|
// white space.
|
||||||
positions.push([i - shift + 1, shift]);
|
positions.push([i - shift + 1, shift]);
|
||||||
shiftOrigin += 1;
|
shiftOrigin += 1;
|
||||||
eol += 1;
|
eol += 1;
|
||||||
return p4.charAt(0);
|
return p5.charAt(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (p5) {
|
if (p6) {
|
||||||
// eol is replaced by space: "foo\nbar" is likely equivalent to
|
// eol is replaced by space: "foo\nbar" is likely equivalent to
|
||||||
// "foo bar".
|
// "foo bar".
|
||||||
positions.push([i - shift + 1, shift - 1]);
|
positions.push([i - shift + 1, shift - 1]);
|
||||||
@ -266,7 +289,7 @@ function normalize(text) {
|
|||||||
return " ";
|
return " ";
|
||||||
}
|
}
|
||||||
|
|
||||||
// p6
|
// p7
|
||||||
if (i + eol === syllablePositions[syllableIndex]?.[1]) {
|
if (i + eol === syllablePositions[syllableIndex]?.[1]) {
|
||||||
// A syllable (1 char) is replaced with several chars (n) so
|
// A syllable (1 char) is replaced with several chars (n) so
|
||||||
// newCharsLen = n - 1.
|
// newCharsLen = n - 1.
|
||||||
@ -278,7 +301,7 @@ function normalize(text) {
|
|||||||
shift -= newCharLen;
|
shift -= newCharLen;
|
||||||
shiftOrigin += newCharLen;
|
shiftOrigin += newCharLen;
|
||||||
}
|
}
|
||||||
return p6;
|
return p7;
|
||||||
}
|
}
|
||||||
);
|
);
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user