From 18f4e560ae4103bd7ece97ecf4b71a7f37596a17 Mon Sep 17 00:00:00 2001 From: Calixte Denizet Date: Sun, 13 Feb 2022 16:55:56 +0100 Subject: [PATCH] =?UTF-8?q?[Search]=20Some=20matches=20were=20incorrectly?= =?UTF-8?q?=20shifted=20because=20of=20some=20'-\n'=20-=20it=20aims=20to?= =?UTF-8?q?=20fix=20#14562;=20-=20'X-\n'=20were=20not=20correctly=20positi?= =?UTF-8?q?oned;=20-=20when=20X=20is=20a=20diacritic=20(e.g.=20in=20"s?= =?UTF-8?q?=C3=A4-\n",=20which=20is=20decomposed=20into=20"sa=C2=A8-\n")?= =?UTF-8?q?=20we=20must=20handle=20both=20things:=20=20=20-=20diacritics?= =?UTF-8?q?=20on=20the=20one=20hand;=20=20=20-=20"-\n"=20on=20the=20other?= =?UTF-8?q?=20hand.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/pdfs/issue14562.pdf.link | 1 + test/test_manifest.json | 7 +++ test/unit/pdf_find_controller_spec.js | 45 ++++++++++++++++ web/pdf_find_controller.js | 78 ++++++++++++++++----------- 4 files changed, 100 insertions(+), 31 deletions(-) create mode 100644 test/pdfs/issue14562.pdf.link diff --git a/test/pdfs/issue14562.pdf.link b/test/pdfs/issue14562.pdf.link new file mode 100644 index 000000000..c729c63be --- /dev/null +++ b/test/pdfs/issue14562.pdf.link @@ -0,0 +1 @@ +https://github.com/mozilla/pdf.js/files/8055863/NHA-20211231-A-018_before.pdf diff --git a/test/test_manifest.json b/test/test_manifest.json index 41f7f7765..e8fa8695c 100644 --- a/test/test_manifest.json +++ b/test/test_manifest.json @@ -6271,5 +6271,12 @@ "value": "PDF.js PDF.js PDF.js" } } + }, + { "id": "issue14562", + "file": "pdfs/issue14562.pdf", + "md5": "565b0a1e46a32e907837506a10d25277", + "rounds": 1, + "link": true, + "type": "other" } ] diff --git a/test/unit/pdf_find_controller_spec.js b/test/unit/pdf_find_controller_spec.js index ee1c53a9b..de8f08507 100644 --- a/test/unit/pdf_find_controller_spec.js +++ b/test/unit/pdf_find_controller_spec.js @@ -16,6 +16,7 @@ import { buildGetDocumentParams } from "./test_utils.js"; import { 
EventBus } from "../../web/event_utils.js"; import { getDocument } from "../../src/display/api.js"; +import { isNodeJS } from "../../src/shared/is_node.js"; import { PDFFindController } from "../../web/pdf_find_controller.js"; import { SimpleLinkService } from "../../web/pdf_link_service.js"; @@ -554,4 +555,48 @@ describe("pdf_find_controller", function () { pageMatchesLength: [[23]], }); }); + + it("performs a search in a text containing diacritics before -\\n", async function () { + if (isNodeJS) { + pending("Linked test-cases are not supported in Node.js."); + } + + const { eventBus, pdfFindController } = await initPdfFindController( + "issue14562.pdf" + ); + + await testSearch({ + eventBus, + pdfFindController, + state: { + query: "ä", + matchDiacritics: true, + }, + matchesPerPage: [80], + selectedMatch: { + pageIndex: 0, + matchIndex: 0, + }, + pageMatches: [ + [ + 299, 337, 414, 476, 623, 797, 978, 984, 1010, 1058, 1079, 1144, 1152, + 1274, 1343, 1391, 1399, 1421, 1497, 1521, 1527, 1684, 1774, 1786, + 1857, 1879, 1909, 1946, 2064, 2074, 2161, 2178, 2213, 2227, 2272, + 2322, 2359, 2401, 2412, 2423, 2462, 2532, 2538, 2553, 2562, 2576, + 2602, 2613, 2638, 2668, 2792, 2805, 2836, 2848, 2859, 2896, 2902, + 2916, 2940, 2960, 3091, 3239, 3249, 3339, 3387, 3394, 3468, 3477, + 3485, 3502, 3690, 3696, 3711, 3758, 3789, 3865, 3977, 4052, 4058, + 4071, + ], + ], + pageMatchesLength: [ + [ + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + ], + ], + }); + }); }); diff --git a/web/pdf_find_controller.js b/web/pdf_find_controller.js index 218c9ae98..7b63da6ef 100644 --- a/web/pdf_find_controller.js +++ b/web/pdf_find_controller.js @@ -96,7 +96,7 @@ function normalize(text) { // Compile the regular expression for text normalization once. 
const replace = Object.keys(CHARACTERS_TO_NORMALIZE).join(""); normalizationRegex = new RegExp( - `([${replace}])|(\\S-\\n)|(\\n)|(\\p{M}+)`, + `([${replace}])|(\\p{M}+(?:-\\n)?)|(\\S-\\n)|(\\n)`, "gum" ); } @@ -159,43 +159,59 @@ function normalize(text) { } if (p2) { - // "X-\n" is removed because an hyphen at the end of a line - // with not a space before is likely here to mark a break - // in a word. - positions.push([i - shift, 1 + shift]); - shift += 1; - shiftOrigin += 1; - eol += 1; - return p2.charAt(0); + const hasTrailingDashEOL = p2.endsWith("\n"); + const len = hasTrailingDashEOL ? p2.length - 2 : p2.length; + + // Diacritics. + hasDiacritics = true; + let jj = len; + if (i + eol === rawDiacriticsPositions[k]?.[1]) { + jj -= rawDiacriticsPositions[k][0]; + ++k; + } + + for (let j = 1; j < jj + 1; j++) { + // i is the position of the first diacritic + // so (i - 1) is the position for the letter before. + positions.push([i - 1 - shift + j, shift - j]); + } + shift -= jj; + shiftOrigin += jj; + + if (hasTrailingDashEOL) { + // Diacritics are followed by a -\n. + // See comments in `if (p3)` block. + i += len - 1; + positions.push([i - shift + 1, 1 + shift]); + shift += 1; + shiftOrigin += 1; + eol += 1; + return p2.slice(0, len); + } + + return p2; } if (p3) { - // eol is replaced by space: "foo\nbar" is likely equivalent to - // "foo bar". - positions.push([i - shift + 1, shift - 1]); - shift -= 1; + // "X-\n" is removed because an hyphen at the end of a line + // with not a space before is likely here to mark a break + // in a word. + // The \n isn't in the original text so here y = i, n = 1 and o = 2. + positions.push([i - shift + 1, 1 + shift]); + shift += 1; shiftOrigin += 1; eol += 1; - return " "; + return p3.charAt(0); } - // Diacritics. 
- hasDiacritics = true; - let jj = p4.length; - if (i + eol === rawDiacriticsPositions[k]?.[1]) { - jj -= rawDiacriticsPositions[k][0]; - ++k; - } - - for (let j = 1; j < jj + 1; j++) { - // i is the position of the first diacritic - // so (i - 1) is the position for the letter before. - positions.push([i - 1 - shift + j, shift - j]); - } - shift -= jj; - shiftOrigin += jj; - - return p4; + // p4 + // eol is replaced by space: "foo\nbar" is likely equivalent to + // "foo bar". + positions.push([i - shift + 1, shift - 1]); + shift -= 1; + shiftOrigin += 1; + eol += 1; + return " "; } );