Normalize fullwidth, halfwidth and circled chars when searching
parent bfe6ff5893
commit 2be64d63e1
test/pdfs/.gitignore (1 line added)
@@ -554,3 +554,4 @@
 !bug1796741.pdf
 !textfields.pdf
 !freetext_no_appearance.pdf
+!issue15690.pdf
test/pdfs/issue15690.pdf (BIN, new executable file)
Binary file not shown.
@@ -647,4 +647,25 @@ describe("pdf_find_controller", function () {
       pageMatchesLength: [[4]],
     });
   });
+
+  it("performs a search in a text containing fullwidth chars", async function () {
+    const { eventBus, pdfFindController } = await initPdfFindController(
+      "issue15690.pdf"
+    );
+
+    await testSearch({
+      eventBus,
+      pdfFindController,
+      state: {
+        query: "o",
+      },
+      matchesPerPage: [13],
+      selectedMatch: {
+        pageIndex: 0,
+        matchIndex: 0,
+      },
+      pageMatches: [[0, 10, 13, 30, 39, 41, 55, 60, 66, 84, 102, 117, 134]],
+      pageMatchesLength: [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
+    });
+  });
 });
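Note (illustration, not part of the patch): the new test exercises the fact that NFKC folds fullwidth, halfwidth and circled characters onto their plain equivalents, which is how a query for "o" can also hit the non-ASCII forms in issue15690.pdf. A minimal sketch in plain JavaScript, with example codepoints chosen for illustration rather than taken from the PDF:

    "ｏ".normalize("NFKC"); // "o"  (U+FF4F FULLWIDTH LATIN SMALL LETTER O)
    "①".normalize("NFKC"); // "1"  (U+2460 CIRCLED DIGIT ONE)
    "ｶ".normalize("NFKC"); // "カ" (U+FF76 HALFWIDTH KATAKANA LETTER KA)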
@@ -95,6 +95,8 @@ const SYLLABLES_LENGTHS = new Map();
 const FIRST_CHAR_SYLLABLES_REG_EXP =
   "[\\u1100-\\u1112\\ud7a4-\\ud7af\\ud84a\\ud84c\\ud850\\ud854\\ud857\\ud85f]";

+const NFKC_CHARS_TO_NORMALIZE = new Map();
+
 let noSyllablesRegExp = null;
 let withSyllablesRegExp = null;

@@ -126,7 +128,13 @@ function normalize(text) {
   } else {
     // Compile the regular expression for text normalization once.
     const replace = Object.keys(CHARACTERS_TO_NORMALIZE).join("");
-    const regexp = `([${replace}])|(\\p{M}+(?:-\\n)?)|(\\S-\\n)|(\\p{Ideographic}\\n)|(\\n)`;
+    const toNormalizeWithNFKC =
+      "\u2460-\u2473" + // Circled numbers.
+      "\u24b6-\u24ff" + // Circled letters/numbers.
+      "\u3244-\u32bf" + // Circled ideograms/numbers.
+      "\u32d0-\u32fe" + // Circled ideograms.
+      "\uff00-\uffef"; // Halfwidth, fullwidth forms.
+    const regexp = `([${replace}])|([${toNormalizeWithNFKC}])|(\\p{M}+(?:-\\n)?)|(\\S-\\n)|(\\p{Ideographic}\\n)|(\\n)`;

     if (syllablePositions.length === 0) {
       // Most of the syllables belong to Hangul so there are no need
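Note (illustration, not part of the patch): the new character class can be tried on its own to see which codepoints get routed to the NFKC branch below; the sample string here is an assumption, not content from the test PDF:

    const toNormalizeWithNFKC =
      "\u2460-\u2473\u24b6-\u24ff\u3244-\u32bf\u32d0-\u32fe\uff00-\uffef";
    const nfkcClass = new RegExp(`[${toNormalizeWithNFKC}]`, "gu");
    "ＰＤＦ①㊙ abc".match(nfkcClass); // ["Ｐ", "Ｄ", "Ｆ", "①", "㊙"]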
@@ -188,11 +196,11 @@ function normalize(text) {

   normalized = normalized.replace(
     normalizationRegex,
-    (match, p1, p2, p3, p4, p5, p6, i) => {
+    (match, p1, p2, p3, p4, p5, p6, p7, i) => {
       i -= shiftOrigin;
       if (p1) {
         // Maybe fractions or quotations mark...
-        const replacement = CHARACTERS_TO_NORMALIZE[match];
+        const replacement = CHARACTERS_TO_NORMALIZE[p1];
         const jj = replacement.length;
         for (let j = 1; j < jj; j++) {
           positions.push([i - shift + j, shift - j]);
@@ -202,8 +210,23 @@ function normalize(text) {
       }

       if (p2) {
-        const hasTrailingDashEOL = p2.endsWith("\n");
-        const len = hasTrailingDashEOL ? p2.length - 2 : p2.length;
+        // Use the NFKC representation to normalize the char.
+        let replacement = NFKC_CHARS_TO_NORMALIZE.get(p2);
+        if (!replacement) {
+          replacement = p2.normalize("NFKC");
+          NFKC_CHARS_TO_NORMALIZE.set(p2, replacement);
+        }
+        const jj = replacement.length;
+        for (let j = 1; j < jj; j++) {
+          positions.push([i - shift + j, shift - j]);
+        }
+        shift -= jj - 1;
+        return replacement;
+      }
+
+      if (p3) {
+        const hasTrailingDashEOL = p3.endsWith("\n");
+        const len = hasTrailingDashEOL ? p3.length - 2 : p3.length;

         // Diacritics.
         hasDiacritics = true;
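Note (sketch, not part of the patch): the new `if (p2)` branch memoizes the NFKC result per matched character, so repeated occurrences of the same character only call String.prototype.normalize once, and the loop that follows records one entry in `positions` per extra character so that, as in the `if (p1)` branch, match offsets in the normalized string can be mapped back to the original text. The helper name below is made up for the illustration:

    const NFKC_CHARS_TO_NORMALIZE = new Map();
    function nfkcFor(char) {
      let replacement = NFKC_CHARS_TO_NORMALIZE.get(char);
      if (!replacement) {
        replacement = char.normalize("NFKC");
        NFKC_CHARS_TO_NORMALIZE.set(char, replacement);
      }
      return replacement;
    }
    nfkcFor("ｏ"); // "o" — computed once, then served from the Map.
    nfkcFor("⑳"); // "20" — one source char expands to two, hence the shift bookkeeping.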
@@ -223,19 +246,19 @@ function normalize(text) {

         if (hasTrailingDashEOL) {
           // Diacritics are followed by a -\n.
-          // See comments in `if (p3)` block.
+          // See comments in `if (p4)` block.
           i += len - 1;
           positions.push([i - shift + 1, 1 + shift]);
           shift += 1;
           shiftOrigin += 1;
           eol += 1;
-          return p2.slice(0, len);
+          return p3.slice(0, len);
         }

-        return p2;
+        return p3;
       }

-      if (p3) {
+      if (p4) {
         // "X-\n" is removed because an hyphen at the end of a line
         // with not a space before is likely here to mark a break
         // in a word.
@@ -244,19 +267,19 @@ function normalize(text) {
         shift += 1;
         shiftOrigin += 1;
         eol += 1;
-        return p3.charAt(0);
+        return p4.charAt(0);
       }

-      if (p4) {
+      if (p5) {
         // An ideographic at the end of a line doesn't imply adding an extra
         // white space.
         positions.push([i - shift + 1, shift]);
         shiftOrigin += 1;
         eol += 1;
-        return p4.charAt(0);
+        return p5.charAt(0);
       }

-      if (p5) {
+      if (p6) {
         // eol is replaced by space: "foo\nbar" is likely equivalent to
         // "foo bar".
         positions.push([i - shift + 1, shift - 1]);
@@ -266,7 +289,7 @@ function normalize(text) {
         return " ";
       }

-      // p6
+      // p7
       if (i + eol === syllablePositions[syllableIndex]?.[1]) {
         // A syllable (1 char) is replaced with several chars (n) so
         // newCharsLen = n - 1.
@@ -278,7 +301,7 @@ function normalize(text) {
         shift -= newCharLen;
         shiftOrigin += newCharLen;
       }
-      return p6;
+      return p7;
     }
   );

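Note (illustration, not part of the patch): the remaining hunks are mechanical renumbering. Inserting a capturing group right after `([${replace}])` shifts every later group by one, and String.prototype.replace passes the captured groups to its callback positionally, so the old p2..p6 become p3..p7:

    // Toy pattern, unrelated to the real normalization regexp.
    "abc".replace(/(a)(b)(c)/, (match, p1, p2, p3, offset, str) => {
      // p1 === "a", p2 === "b", p3 === "c", offset === 0, str === "abc"
      return match;
    });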