Normalize fullwidth, halfwidth and circled chars when searching
This commit is contained in:
		
							parent
							
								
									bfe6ff5893
								
							
						
					
					
						commit
						2be64d63e1
					
				
							
								
								
									
										1
									
								
								test/pdfs/.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										1
									
								
								test/pdfs/.gitignore
									
									
									
									
										vendored
									
									
								
							| @ -554,3 +554,4 @@ | ||||
| !bug1796741.pdf | ||||
| !textfields.pdf | ||||
| !freetext_no_appearance.pdf | ||||
| !issue15690.pdf | ||||
|  | ||||
							
								
								
									
										
											BIN
										
									
								
								test/pdfs/issue15690.pdf
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								test/pdfs/issue15690.pdf
									
									
									
									
									
										Executable file
									
								
							
										
											Binary file not shown.
										
									
								
							| @ -647,4 +647,25 @@ describe("pdf_find_controller", function () { | ||||
|       pageMatchesLength: [[4]], | ||||
|     }); | ||||
|   }); | ||||
| 
 | ||||
|   it("performs a search in a text containing fullwidth chars", async function () { | ||||
|     const { eventBus, pdfFindController } = await initPdfFindController( | ||||
|       "issue15690.pdf" | ||||
|     ); | ||||
| 
 | ||||
|     await testSearch({ | ||||
|       eventBus, | ||||
|       pdfFindController, | ||||
|       state: { | ||||
|         query: "o", | ||||
|       }, | ||||
|       matchesPerPage: [13], | ||||
|       selectedMatch: { | ||||
|         pageIndex: 0, | ||||
|         matchIndex: 0, | ||||
|       }, | ||||
|       pageMatches: [[0, 10, 13, 30, 39, 41, 55, 60, 66, 84, 102, 117, 134]], | ||||
|       pageMatchesLength: [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], | ||||
|     }); | ||||
|   }); | ||||
| }); | ||||
|  | ||||
| @ -95,6 +95,8 @@ const SYLLABLES_LENGTHS = new Map(); | ||||
| const FIRST_CHAR_SYLLABLES_REG_EXP = | ||||
|   "[\\u1100-\\u1112\\ud7a4-\\ud7af\\ud84a\\ud84c\\ud850\\ud854\\ud857\\ud85f]"; | ||||
| 
 | ||||
| const NFKC_CHARS_TO_NORMALIZE = new Map(); | ||||
| 
 | ||||
| let noSyllablesRegExp = null; | ||||
| let withSyllablesRegExp = null; | ||||
| 
 | ||||
| @ -126,7 +128,13 @@ function normalize(text) { | ||||
|   } else { | ||||
|     // Compile the regular expression for text normalization once.
 | ||||
|     const replace = Object.keys(CHARACTERS_TO_NORMALIZE).join(""); | ||||
|     const regexp = `([${replace}])|(\\p{M}+(?:-\\n)?)|(\\S-\\n)|(\\p{Ideographic}\\n)|(\\n)`; | ||||
|     const toNormalizeWithNFKC = | ||||
|       "\u2460-\u2473" + // Circled numbers.
 | ||||
|       "\u24b6-\u24ff" + // Circled letters/numbers.
 | ||||
|       "\u3244-\u32bf" + // Circled ideograms/numbers.
 | ||||
|       "\u32d0-\u32fe" + // Circled ideograms.
 | ||||
|       "\uff00-\uffef"; // Halfwidth, fullwidth forms.
 | ||||
|     const regexp = `([${replace}])|([${toNormalizeWithNFKC}])|(\\p{M}+(?:-\\n)?)|(\\S-\\n)|(\\p{Ideographic}\\n)|(\\n)`; | ||||
| 
 | ||||
|     if (syllablePositions.length === 0) { | ||||
|       // Most of the syllables belong to Hangul so there is no need
 | ||||
| @ -188,11 +196,11 @@ function normalize(text) { | ||||
| 
 | ||||
|   normalized = normalized.replace( | ||||
|     normalizationRegex, | ||||
|     (match, p1, p2, p3, p4, p5, p6, i) => { | ||||
|     (match, p1, p2, p3, p4, p5, p6, p7, i) => { | ||||
|       i -= shiftOrigin; | ||||
|       if (p1) { | ||||
|         // Maybe fractions or quotations mark...
 | ||||
|         const replacement = CHARACTERS_TO_NORMALIZE[match]; | ||||
|         const replacement = CHARACTERS_TO_NORMALIZE[p1]; | ||||
|         const jj = replacement.length; | ||||
|         for (let j = 1; j < jj; j++) { | ||||
|           positions.push([i - shift + j, shift - j]); | ||||
| @ -202,8 +210,23 @@ function normalize(text) { | ||||
|       } | ||||
| 
 | ||||
|       if (p2) { | ||||
|         const hasTrailingDashEOL = p2.endsWith("\n"); | ||||
|         const len = hasTrailingDashEOL ? p2.length - 2 : p2.length; | ||||
|         // Use the NFKC representation to normalize the char.
 | ||||
|         let replacement = NFKC_CHARS_TO_NORMALIZE.get(p2); | ||||
|         if (!replacement) { | ||||
|           replacement = p2.normalize("NFKC"); | ||||
|           NFKC_CHARS_TO_NORMALIZE.set(p2, replacement); | ||||
|         } | ||||
|         const jj = replacement.length; | ||||
|         for (let j = 1; j < jj; j++) { | ||||
|           positions.push([i - shift + j, shift - j]); | ||||
|         } | ||||
|         shift -= jj - 1; | ||||
|         return replacement; | ||||
|       } | ||||
| 
 | ||||
|       if (p3) { | ||||
|         const hasTrailingDashEOL = p3.endsWith("\n"); | ||||
|         const len = hasTrailingDashEOL ? p3.length - 2 : p3.length; | ||||
| 
 | ||||
|         // Diacritics.
 | ||||
|         hasDiacritics = true; | ||||
| @ -223,19 +246,19 @@ function normalize(text) { | ||||
| 
 | ||||
|         if (hasTrailingDashEOL) { | ||||
|           // Diacritics are followed by a -\n.
 | ||||
|           // See comments in `if (p3)` block.
 | ||||
|           // See comments in `if (p4)` block.
 | ||||
|           i += len - 1; | ||||
|           positions.push([i - shift + 1, 1 + shift]); | ||||
|           shift += 1; | ||||
|           shiftOrigin += 1; | ||||
|           eol += 1; | ||||
|           return p2.slice(0, len); | ||||
|           return p3.slice(0, len); | ||||
|         } | ||||
| 
 | ||||
|         return p2; | ||||
|         return p3; | ||||
|       } | ||||
| 
 | ||||
|       if (p3) { | ||||
|       if (p4) { | ||||
|         // "X-\n" is removed because a hyphen at the end of a line
 | ||||
|         // without a space before it is likely there to mark a break
 | ||||
|         // in a word.
 | ||||
| @ -244,19 +267,19 @@ function normalize(text) { | ||||
|         shift += 1; | ||||
|         shiftOrigin += 1; | ||||
|         eol += 1; | ||||
|         return p3.charAt(0); | ||||
|         return p4.charAt(0); | ||||
|       } | ||||
| 
 | ||||
|       if (p4) { | ||||
|       if (p5) { | ||||
|         // An ideographic at the end of a line doesn't imply adding an extra
 | ||||
|         // white space.
 | ||||
|         positions.push([i - shift + 1, shift]); | ||||
|         shiftOrigin += 1; | ||||
|         eol += 1; | ||||
|         return p4.charAt(0); | ||||
|         return p5.charAt(0); | ||||
|       } | ||||
| 
 | ||||
|       if (p5) { | ||||
|       if (p6) { | ||||
|         // eol is replaced by space: "foo\nbar" is likely equivalent to
 | ||||
|         // "foo bar".
 | ||||
|         positions.push([i - shift + 1, shift - 1]); | ||||
| @ -266,7 +289,7 @@ function normalize(text) { | ||||
|         return " "; | ||||
|       } | ||||
| 
 | ||||
|       // p6
 | ||||
|       // p7
 | ||||
|       if (i + eol === syllablePositions[syllableIndex]?.[1]) { | ||||
|         // A syllable (1 char) is replaced with several chars (n) so
 | ||||
|         // newCharsLen = n - 1.
 | ||||
| @ -278,7 +301,7 @@ function normalize(text) { | ||||
|         shift -= newCharLen; | ||||
|         shiftOrigin += newCharLen; | ||||
|       } | ||||
|       return p6; | ||||
|       return p7; | ||||
|     } | ||||
|   ); | ||||
| 
 | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user