[Search] Some matches were incorrectly shifted because of some '-\n'
- It aims to fix #14562.
- 'X-\n' sequences were not correctly positioned.
- When X is a diacritic (e.g. in "sä-\n", which is decomposed into "sa¨-\n"), we must handle both things: the diacritics on the one hand, and the "-\n" on the other hand.
This commit is contained in:
parent
263c89581f
commit
18f4e560ae
1
test/pdfs/issue14562.pdf.link
Normal file
1
test/pdfs/issue14562.pdf.link
Normal file
@ -0,0 +1 @@
|
||||
https://github.com/mozilla/pdf.js/files/8055863/NHA-20211231-A-018_before.pdf
|
@ -6271,5 +6271,12 @@
|
||||
"value": "PDF.js PDF.js PDF.js"
|
||||
}
|
||||
}
|
||||
},
|
||||
{ "id": "issue14562",
|
||||
"file": "pdfs/issue14562.pdf",
|
||||
"md5": "565b0a1e46a32e907837506a10d25277",
|
||||
"rounds": 1,
|
||||
"link": true,
|
||||
"type": "other"
|
||||
}
|
||||
]
|
||||
|
@ -16,6 +16,7 @@
|
||||
import { buildGetDocumentParams } from "./test_utils.js";
|
||||
import { EventBus } from "../../web/event_utils.js";
|
||||
import { getDocument } from "../../src/display/api.js";
|
||||
import { isNodeJS } from "../../src/shared/is_node.js";
|
||||
import { PDFFindController } from "../../web/pdf_find_controller.js";
|
||||
import { SimpleLinkService } from "../../web/pdf_link_service.js";
|
||||
|
||||
@ -554,4 +555,48 @@ describe("pdf_find_controller", function () {
|
||||
pageMatchesLength: [[23]],
|
||||
});
|
||||
});
|
||||
|
||||
it("performs a search in a text containing diacritics before -\\n", async function () {
|
||||
if (isNodeJS) {
|
||||
pending("Linked test-cases are not supported in Node.js.");
|
||||
}
|
||||
|
||||
const { eventBus, pdfFindController } = await initPdfFindController(
|
||||
"issue14562.pdf"
|
||||
);
|
||||
|
||||
await testSearch({
|
||||
eventBus,
|
||||
pdfFindController,
|
||||
state: {
|
||||
query: "ä",
|
||||
matchDiacritics: true,
|
||||
},
|
||||
matchesPerPage: [80],
|
||||
selectedMatch: {
|
||||
pageIndex: 0,
|
||||
matchIndex: 0,
|
||||
},
|
||||
pageMatches: [
|
||||
[
|
||||
299, 337, 414, 476, 623, 797, 978, 984, 1010, 1058, 1079, 1144, 1152,
|
||||
1274, 1343, 1391, 1399, 1421, 1497, 1521, 1527, 1684, 1774, 1786,
|
||||
1857, 1879, 1909, 1946, 2064, 2074, 2161, 2178, 2213, 2227, 2272,
|
||||
2322, 2359, 2401, 2412, 2423, 2462, 2532, 2538, 2553, 2562, 2576,
|
||||
2602, 2613, 2638, 2668, 2792, 2805, 2836, 2848, 2859, 2896, 2902,
|
||||
2916, 2940, 2960, 3091, 3239, 3249, 3339, 3387, 3394, 3468, 3477,
|
||||
3485, 3502, 3690, 3696, 3711, 3758, 3789, 3865, 3977, 4052, 4058,
|
||||
4071,
|
||||
],
|
||||
],
|
||||
pageMatchesLength: [
|
||||
[
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
||||
],
|
||||
],
|
||||
});
|
||||
});
|
||||
});
|
||||
|
@ -96,7 +96,7 @@ function normalize(text) {
|
||||
// Compile the regular expression for text normalization once.
|
||||
const replace = Object.keys(CHARACTERS_TO_NORMALIZE).join("");
|
||||
normalizationRegex = new RegExp(
|
||||
`([${replace}])|(\\S-\\n)|(\\n)|(\\p{M}+)`,
|
||||
`([${replace}])|(\\p{M}+(?:-\\n)?)|(\\S-\\n)|(\\n)`,
|
||||
"gum"
|
||||
);
|
||||
}
|
||||
@ -159,43 +159,59 @@ function normalize(text) {
|
||||
}
|
||||
|
||||
if (p2) {
|
||||
// "X-\n" is removed because an hyphen at the end of a line
|
||||
// with not a space before is likely here to mark a break
|
||||
// in a word.
|
||||
positions.push([i - shift, 1 + shift]);
|
||||
shift += 1;
|
||||
shiftOrigin += 1;
|
||||
eol += 1;
|
||||
return p2.charAt(0);
|
||||
const hasTrailingDashEOL = p2.endsWith("\n");
|
||||
const len = hasTrailingDashEOL ? p2.length - 2 : p2.length;
|
||||
|
||||
// Diacritics.
|
||||
hasDiacritics = true;
|
||||
let jj = len;
|
||||
if (i + eol === rawDiacriticsPositions[k]?.[1]) {
|
||||
jj -= rawDiacriticsPositions[k][0];
|
||||
++k;
|
||||
}
|
||||
|
||||
for (let j = 1; j < jj + 1; j++) {
|
||||
// i is the position of the first diacritic
|
||||
// so (i - 1) is the position for the letter before.
|
||||
positions.push([i - 1 - shift + j, shift - j]);
|
||||
}
|
||||
shift -= jj;
|
||||
shiftOrigin += jj;
|
||||
|
||||
if (hasTrailingDashEOL) {
|
||||
// Diacritics are followed by a -\n.
|
||||
// See comments in `if (p3)` block.
|
||||
i += len - 1;
|
||||
positions.push([i - shift + 1, 1 + shift]);
|
||||
shift += 1;
|
||||
shiftOrigin += 1;
|
||||
eol += 1;
|
||||
return p2.slice(0, len);
|
||||
}
|
||||
|
||||
return p2;
|
||||
}
|
||||
|
||||
if (p3) {
|
||||
// eol is replaced by space: "foo\nbar" is likely equivalent to
|
||||
// "foo bar".
|
||||
positions.push([i - shift + 1, shift - 1]);
|
||||
shift -= 1;
|
||||
// "X-\n" is removed because an hyphen at the end of a line
|
||||
// with not a space before is likely here to mark a break
|
||||
// in a word.
|
||||
// The \n isn't in the original text so here y = i, n = 1 and o = 2.
|
||||
positions.push([i - shift + 1, 1 + shift]);
|
||||
shift += 1;
|
||||
shiftOrigin += 1;
|
||||
eol += 1;
|
||||
return " ";
|
||||
return p3.charAt(0);
|
||||
}
|
||||
|
||||
// Diacritics.
|
||||
hasDiacritics = true;
|
||||
let jj = p4.length;
|
||||
if (i + eol === rawDiacriticsPositions[k]?.[1]) {
|
||||
jj -= rawDiacriticsPositions[k][0];
|
||||
++k;
|
||||
}
|
||||
|
||||
for (let j = 1; j < jj + 1; j++) {
|
||||
// i is the position of the first diacritic
|
||||
// so (i - 1) is the position for the letter before.
|
||||
positions.push([i - 1 - shift + j, shift - j]);
|
||||
}
|
||||
shift -= jj;
|
||||
shiftOrigin += jj;
|
||||
|
||||
return p4;
|
||||
// p4
|
||||
// eol is replaced by space: "foo\nbar" is likely equivalent to
|
||||
// "foo bar".
|
||||
positions.push([i - shift + 1, shift - 1]);
|
||||
shift -= 1;
|
||||
shiftOrigin += 1;
|
||||
eol += 1;
|
||||
return " ";
|
||||
}
|
||||
);
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user