# Patch summary (leading text before the first "diff --git" header; not part of any hunk).
#
# Purpose: make the find controller's normalization regex treat Hiragana/Katakana
# followed by a newline the same way it already treats an Ideographic character
# followed by a newline, and add a regression test for it.
#
# Files touched:
#   1. test/pdfs/.gitignore
#      Adds "!issue15759.pdf" so the new fixture is not ignored.
#   2. test/pdfs/issue15759.pdf (new binary file)
#      NOTE(review): added with mode 100755; an executable bit on a PDF fixture is
#      presumably unintended -- confirm against the other fixtures.
#   3. test/unit/pdf_find_controller_spec.js
#      Adds the case "performs a search in a text with some Katakana at the end of
#      a line": searches issue15759.pdf for the five-character Katakana query and
#      expects exactly one match on page 0, at offset 6, with length 5.
#   4. web/pdf_find_controller.js, hunk inside normalize(text)
#      Introduces CJK = "(?:\p{Ideographic}|[\u3040-\u30FF])" and substitutes it
#      for the bare \p{Ideographic} in the fifth capture group of the
#      normalization regexp, so that group now also matches a code point in
#      U+3040-U+30FF (Hiragana and Katakana blocks) followed by "\n".
#      How that capture group is consumed is outside this hunk -- presumably the
#      line break is dropped for CJK text; verify in the replace callback.
#      NOTE(review): the explicit range does not cover other kana blocks (e.g.
#      Katakana Phonetic Extensions U+31F0-U+31FF) -- confirm whether that matters.
#
# NOTE(review): as stored here the whole patch sits on a single line; the original
# line breaks would have to be restored before a patch tool could apply it.
diff --git a/test/pdfs/.gitignore b/test/pdfs/.gitignore index f5b148498..dd1ed3356 100644 --- a/test/pdfs/.gitignore +++ b/test/pdfs/.gitignore @@ -557,3 +557,4 @@ !freetext_no_appearance.pdf !issue15690.pdf !bug1802888.pdf +!issue15759.pdf diff --git a/test/pdfs/issue15759.pdf b/test/pdfs/issue15759.pdf new file mode 100755 index 000000000..494898d17 Binary files /dev/null and b/test/pdfs/issue15759.pdf differ diff --git a/test/unit/pdf_find_controller_spec.js b/test/unit/pdf_find_controller_spec.js index 235e2a6eb..271a758f0 100644 --- a/test/unit/pdf_find_controller_spec.js +++ b/test/unit/pdf_find_controller_spec.js @@ -668,4 +668,25 @@ describe("pdf_find_controller", function () { pageMatchesLength: [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], }); }); + + it("performs a search in a text with some Katakana at the end of a line", async function () { + const { eventBus, pdfFindController } = await initPdfFindController( + "issue15759.pdf" + ); + + await testSearch({ + eventBus, + pdfFindController, + state: { + query: "ソレノイド", + }, + matchesPerPage: [1], + selectedMatch: { + pageIndex: 0, + matchIndex: 0, + }, + pageMatches: [[6]], + pageMatchesLength: [[5]], + }); + }); }); diff --git a/web/pdf_find_controller.js b/web/pdf_find_controller.js index 276015d72..3dab4fa89 100644 --- a/web/pdf_find_controller.js +++ b/web/pdf_find_controller.js @@ -132,7 +132,11 @@ function normalize(text) { "\u3244-\u32bf" + // Circled ideograms/numbers. "\u32d0-\u32fe" + // Circled ideograms. "\uff00-\uffef"; // Halfwidth, fullwidth forms. - const regexp = `([${replace}])|([${toNormalizeWithNFKC}])|(\\p{M}+(?:-\\n)?)|(\\S-\\n)|(\\p{Ideographic}\\n)|(\\n)`; + + // 3040-309F: Hiragana + // 30A0-30FF: Katakana + const CJK = "(?:\\p{Ideographic}|[\u3040-\u30FF])"; + const regexp = `([${replace}])|([${toNormalizeWithNFKC}])|(\\p{M}+(?:-\\n)?)|(\\S-\\n)|(${CJK}\\n)|(\\n)`; if (syllablePositions.length === 0) { // Most of the syllables belong to Hangul so there are no need