[Search] Some matches were incorrectly shifted because of some '-\n'

- it aims to fix #14562;
- 'X-\n' sequences were not correctly positioned;
- when X is a diacritic (e.g. in "sä-\n", which is decomposed into "sa¨-\n") we must handle both things:
  - diacritics on the one hand;
  - "-\n" on the other hand.
This commit is contained in:
Calixte Denizet 2022-02-13 16:55:56 +01:00
parent 263c89581f
commit 18f4e560ae
4 changed files with 100 additions and 31 deletions

View File

@ -0,0 +1 @@
https://github.com/mozilla/pdf.js/files/8055863/NHA-20211231-A-018_before.pdf

View File

@ -6271,5 +6271,12 @@
"value": "PDF.js PDF.js PDF.js"
}
}
},
{ "id": "issue14562",
"file": "pdfs/issue14562.pdf",
"md5": "565b0a1e46a32e907837506a10d25277",
"rounds": 1,
"link": true,
"type": "other"
}
]

View File

@ -16,6 +16,7 @@
import { buildGetDocumentParams } from "./test_utils.js";
import { EventBus } from "../../web/event_utils.js";
import { getDocument } from "../../src/display/api.js";
import { isNodeJS } from "../../src/shared/is_node.js";
import { PDFFindController } from "../../web/pdf_find_controller.js";
import { SimpleLinkService } from "../../web/pdf_link_service.js";
@ -554,4 +555,48 @@ describe("pdf_find_controller", function () {
pageMatchesLength: [[23]],
});
});
// Searches for "ä" (with diacritics matching enabled) in issue14562.pdf and
// checks the offset and length reported for each of the 80 expected matches.
it("performs a search in a text containing diacritics before -\\n", async function () {
  // Bail out under Node.js, where linked test-cases cannot be used.
  if (isNodeJS) {
    pending("Linked test-cases are not supported in Node.js.");
  }

  // Expected start offset of every match on the first (and only checked) page.
  const expectedOffsets = [
    299, 337, 414, 476, 623, 797, 978, 984, 1010, 1058, 1079, 1144, 1152,
    1274, 1343, 1391, 1399, 1421, 1497, 1521, 1527, 1684, 1774, 1786,
    1857, 1879, 1909, 1946, 2064, 2074, 2161, 2178, 2213, 2227, 2272,
    2322, 2359, 2401, 2412, 2423, 2462, 2532, 2538, 2553, 2562, 2576,
    2602, 2613, 2638, 2668, 2792, 2805, 2836, 2848, 2859, 2896, 2902,
    2916, 2940, 2960, 3091, 3239, 3249, 3339, 3387, 3394, 3468, 3477,
    3485, 3502, 3690, 3696, 3711, 3758, 3789, 3865, 3977, 4052, 4058,
    4071,
  ];
  // Every match is expected to have a length of exactly 1.
  const expectedLengths = Array.from(expectedOffsets, () => 1);

  const setup = await initPdfFindController("issue14562.pdf");

  await testSearch({
    eventBus: setup.eventBus,
    pdfFindController: setup.pdfFindController,
    state: { query: "ä", matchDiacritics: true },
    matchesPerPage: [80],
    selectedMatch: { pageIndex: 0, matchIndex: 0 },
    pageMatches: [expectedOffsets],
    pageMatchesLength: [expectedLengths],
  });
});
});

View File

@ -96,7 +96,7 @@ function normalize(text) {
// Compile the regular expression for text normalization once.
const replace = Object.keys(CHARACTERS_TO_NORMALIZE).join("");
normalizationRegex = new RegExp(
`([${replace}])|(\\S-\\n)|(\\n)|(\\p{M}+)`,
`([${replace}])|(\\p{M}+(?:-\\n)?)|(\\S-\\n)|(\\n)`,
"gum"
);
}
@ -159,43 +159,59 @@ function normalize(text) {
}
if (p2) {
// "X-\n" is removed because a hyphen at the end of a line
// with no space before it is likely there to mark a break
// in a word.
positions.push([i - shift, 1 + shift]);
shift += 1;
shiftOrigin += 1;
eol += 1;
return p2.charAt(0);
const hasTrailingDashEOL = p2.endsWith("\n");
const len = hasTrailingDashEOL ? p2.length - 2 : p2.length;
// Diacritics.
hasDiacritics = true;
let jj = len;
if (i + eol === rawDiacriticsPositions[k]?.[1]) {
jj -= rawDiacriticsPositions[k][0];
++k;
}
for (let j = 1; j < jj + 1; j++) {
// i is the position of the first diacritic
// so (i - 1) is the position for the letter before.
positions.push([i - 1 - shift + j, shift - j]);
}
shift -= jj;
shiftOrigin += jj;
if (hasTrailingDashEOL) {
// Diacritics are followed by a -\n.
// See comments in `if (p3)` block.
i += len - 1;
positions.push([i - shift + 1, 1 + shift]);
shift += 1;
shiftOrigin += 1;
eol += 1;
return p2.slice(0, len);
}
return p2;
}
if (p3) {
// eol is replaced by space: "foo\nbar" is likely equivalent to
// "foo bar".
positions.push([i - shift + 1, shift - 1]);
shift -= 1;
// "X-\n" is removed because a hyphen at the end of a line
// with no space before it is likely there to mark a break
// in a word.
// The \n isn't in the original text so here y = i, n = 1 and o = 2.
positions.push([i - shift + 1, 1 + shift]);
shift += 1;
shiftOrigin += 1;
eol += 1;
return " ";
return p3.charAt(0);
}
// Diacritics.
hasDiacritics = true;
let jj = p4.length;
if (i + eol === rawDiacriticsPositions[k]?.[1]) {
jj -= rawDiacriticsPositions[k][0];
++k;
}
for (let j = 1; j < jj + 1; j++) {
// i is the position of the first diacritic
// so (i - 1) is the position for the letter before.
positions.push([i - 1 - shift + j, shift - j]);
}
shift -= jj;
shiftOrigin += jj;
return p4;
// p4
// eol is replaced by space: "foo\nbar" is likely equivalent to
// "foo bar".
positions.push([i - shift + 1, shift - 1]);
shift -= 1;
shiftOrigin += 1;
eol += 1;
return " ";
}
);