Merge pull request #14965 from calixteman/1771477

Support Hangul syllables when searching some text (bug 1771477)
This commit is contained in:
calixteman 2022-05-28 17:52:27 +02:00 committed by GitHub
commit 5b3fdee5f5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 109 additions and 22 deletions

View File

@ -524,3 +524,5 @@
!issue14627.pdf
!issue14862.pdf
!issue14705.pdf
!bug1771477.pdf

BIN
test/pdfs/bug1771477.pdf Normal file

Binary file not shown.

View File

@ -21,6 +21,10 @@ import { PDFFindController } from "../../web/pdf_find_controller.js";
import { SimpleLinkService } from "../../web/pdf_link_service.js";
const tracemonkeyFileName = "tracemonkey.pdf";
// CMap options that are spread into the second argument of
// buildGetDocumentParams when a test document is loaded below.
// The relative path to the bcmaps directory depends on `isNodeJS`.
const CMAP_PARAMS = {
cMapUrl: isNodeJS ? "./external/bcmaps/" : "../../../external/bcmaps/",
cMapPacked: true,
};
class MockLinkService extends SimpleLinkService {
constructor() {
@ -49,7 +53,9 @@ class MockLinkService extends SimpleLinkService {
async function initPdfFindController(filename) {
const loadingTask = getDocument(
buildGetDocumentParams(filename || tracemonkeyFileName)
buildGetDocumentParams(filename || tracemonkeyFileName, {
...CMAP_PARAMS,
})
);
const pdfDocument = await loadingTask.promise;
@ -599,4 +605,25 @@ describe("pdf_find_controller", function () {
],
});
});
// Test added for bug 1771477: searches bug1771477.pdf for a query made of
// Hangul syllables and checks the reported match.
it("performs a search in a text containing some Hangul syllables", async function () {
const { eventBus, pdfFindController } = await initPdfFindController(
"bug1771477.pdf"
);
await testSearch({
eventBus,
pdfFindController,
state: {
// Five Hangul syllables, a space, then two more syllables: 8 chars.
query: "안녕하세요 세계",
},
// Exactly one match is expected, in the single page entry listed here.
matchesPerPage: [1],
selectedMatch: {
pageIndex: 0,
matchIndex: 0,
},
// NOTE(review): presumably the offset of the match in the page text;
// confirm against testSearch.
pageMatches: [[139]],
// Same value as the length of the query string (8).
pageMatchesLength: [[8]],
});
});
});

View File

@ -86,19 +86,62 @@ const SPECIAL_CHARS_REG_EXP =
const NOT_DIACRITIC_FROM_END_REG_EXP = /([^\p{M}])\p{M}*$/u;
const NOT_DIACRITIC_FROM_START_REG_EXP = /^\p{M}*([^\p{M}])/u;
let normalizationRegex = null;
// The range [AC00-D7AF] corresponds to the Hangul syllables.
// The few other chars are some CJK Compatibility Ideographs.
// The regexp is global: normalize() calls exec() on it repeatedly in order
// to visit every run of such chars in the text.
const SYLLABLES_REG_EXP = /[\uAC00-\uD7AF\uFA6C\uFACF-\uFAD1\uFAD5-\uFAD7]+/g;
// Cache mapping a char matched by the regexp above to the length of its
// NFD-normalized form.
const SYLLABLES_LENGTHS = new Map();
// When decomposed (using NFD), the above syllables will start
// with one of the chars in this regexp.
// This is the source of a character class (a string, not a RegExp): it is
// appended as an extra capture group when the normalization regexp is built.
const FIRST_CHAR_SYLLABLES_REG_EXP =
"[\\u1100-\\u1112\\ud7a4-\\ud7af\\ud84a\\ud84c\\ud850\\ud854\\ud857\\ud85f]";
// Lazily compiled normalization regexps, kept between calls to normalize():
// one for texts without any of the above syllables, one for texts with some.
let noSyllablesRegExp = null;
let withSyllablesRegExp = null;
function normalize(text) {
// The diacritics in the text or in the query can be composed or not.
// So we use a decomposed text using NFD (and the same for the query)
// in order to be sure that diacritics are in the same order.
if (!normalizationRegex) {
// Collect syllables length and positions.
const syllablePositions = [];
let m;
while ((m = SYLLABLES_REG_EXP.exec(text)) !== null) {
let { index } = m;
for (const char of m[0]) {
let len = SYLLABLES_LENGTHS.get(char);
if (!len) {
len = char.normalize("NFD").length;
SYLLABLES_LENGTHS.set(char, len);
}
syllablePositions.push([len, index++]);
}
}
let normalizationRegex;
if (syllablePositions.length === 0 && noSyllablesRegExp) {
normalizationRegex = noSyllablesRegExp;
} else if (syllablePositions.length > 0 && withSyllablesRegExp) {
normalizationRegex = withSyllablesRegExp;
} else {
// Compile the regular expression for text normalization once.
const replace = Object.keys(CHARACTERS_TO_NORMALIZE).join("");
normalizationRegex = new RegExp(
`([${replace}])|(\\p{M}+(?:-\\n)?)|(\\S-\\n)|(\\n)`,
"gum"
);
const regexp = `([${replace}])|(\\p{M}+(?:-\\n)?)|(\\S-\\n)|(\\n)`;
if (syllablePositions.length === 0) {
// Most of the syllables belong to Hangul, so there is no need
// to search for them in a non-Hangul document.
// We use \0 in order to have the same number of groups.
normalizationRegex = noSyllablesRegExp = new RegExp(
regexp + "|(\\u0000)",
"gum"
);
} else {
normalizationRegex = withSyllablesRegExp = new RegExp(
regexp + `|(${FIRST_CHAR_SYLLABLES_REG_EXP})`,
"gum"
);
}
}
// The goal of this function is to normalize the string and
@ -130,14 +173,14 @@ function normalize(text) {
// Collect diacritics length and positions.
const rawDiacriticsPositions = [];
let m;
while ((m = DIACRITICS_REG_EXP.exec(text)) !== null) {
rawDiacriticsPositions.push([m[0].length, m.index]);
}
let normalized = text.normalize("NFD");
const positions = [[0, 0]];
let k = 0;
let rawDiacriticsIndex = 0;
let syllableIndex = 0;
let shift = 0;
let shiftOrigin = 0;
let eol = 0;
@ -145,7 +188,7 @@ function normalize(text) {
normalized = normalized.replace(
normalizationRegex,
(match, p1, p2, p3, p4, i) => {
(match, p1, p2, p3, p4, p5, i) => {
i -= shiftOrigin;
if (p1) {
// Maybe fractions or quotation marks...
@ -165,12 +208,12 @@ function normalize(text) {
// Diacritics.
hasDiacritics = true;
let jj = len;
if (i + eol === rawDiacriticsPositions[k]?.[1]) {
jj -= rawDiacriticsPositions[k][0];
++k;
if (i + eol === rawDiacriticsPositions[rawDiacriticsIndex]?.[1]) {
jj -= rawDiacriticsPositions[rawDiacriticsIndex][0];
++rawDiacriticsIndex;
}
for (let j = 1; j < jj + 1; j++) {
for (let j = 1; j <= jj; j++) {
// i is the position of the first diacritic
// so (i - 1) is the position for the letter before.
positions.push([i - 1 - shift + j, shift - j]);
@ -204,14 +247,29 @@ function normalize(text) {
return p3.charAt(0);
}
// p4
// eol is replaced by space: "foo\nbar" is likely equivalent to
// "foo bar".
positions.push([i - shift + 1, shift - 1]);
shift -= 1;
shiftOrigin += 1;
eol += 1;
return " ";
if (p4) {
// eol is replaced by space: "foo\nbar" is likely equivalent to
// "foo bar".
positions.push([i - shift + 1, shift - 1]);
shift -= 1;
shiftOrigin += 1;
eol += 1;
return " ";
}
// p5
if (i + eol === syllablePositions[syllableIndex]?.[1]) {
// A syllable (1 char) is replaced with several chars (n), so
// newCharLen = n - 1.
const newCharLen = syllablePositions[syllableIndex][0] - 1;
++syllableIndex;
for (let j = 1; j <= newCharLen; j++) {
positions.push([i - (shift - j), shift - j]);
}
shift -= newCharLen;
shiftOrigin += newCharLen;
}
return p5;
}
);