Build a fallback ToUnicode map for simple fonts (issue 8229)
				
					
				
			In some fonts, the included `ToUnicode` data is incomplete, causing text-selection to not work properly. For simple fonts that contain encoding data, we can manually build a `ToUnicode` map to attempt to improve things. Please note that since we're currently using the `ToUnicode` data during glyph mapping, in an attempt to avoid rendering regressions, I purposely didn't want to amend the original `ToUnicode` data for this text-selection edge-case. Instead, I opted for the current solution, which will (hopefully) give slightly better text-extraction results in PDF files with incomplete `ToUnicode` data. According to the PDF specification, see [section 9.10.2](http://www.adobe.com/content/dam/acom/en/devnet/acrobat/pdfs/PDF32000_2008.pdf#G8.1873172): > A conforming reader can use these methods, in the priority given, to map a character code to a Unicode value. > ... Reading that paragraph literally, it doesn't seem too unreasonable to use *different* methods for different charcodes. Fixes 8229.
This commit is contained in:
		
							parent
							
								
									ffbfc3c2a7
								
							
						
					
					
						commit
						61e19bee43
					
				| @ -2021,6 +2021,14 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { | ||||
| 
 | ||||
|       // Section 9.10.2 Mapping Character Codes to Unicode Values
 | ||||
|       if (properties.hasIncludedToUnicodeMap) { | ||||
|         // Some fonts contain incomplete ToUnicode data, causing issues with
 | ||||
|         // text-extraction. For simple fonts, containing encoding information,
 | ||||
|         // use a fallback ToUnicode map to improve this (fixes issue8229.pdf).
 | ||||
|         if (!properties.composite && properties.hasEncoding) { | ||||
|           properties.fallbackToUnicode = | ||||
|             this._buildSimpleFontToUnicode(properties); | ||||
|         } | ||||
| 
 | ||||
|         return Promise.resolve(properties.toUnicode); | ||||
|       } | ||||
| 
 | ||||
|  | ||||
| @ -211,9 +211,9 @@ var Glyph = (function GlyphClosure() { | ||||
| })(); | ||||
| 
 | ||||
| var ToUnicodeMap = (function ToUnicodeMapClosure() { | ||||
|   function ToUnicodeMap(cmap) { | ||||
|   function ToUnicodeMap(cmap = []) { | ||||
|     // The elements of this._map can be integers or strings, depending on how
 | ||||
|     // |cmap| was created.
 | ||||
|     // `cmap` was created.
 | ||||
|     this._map = cmap; | ||||
|   } | ||||
| 
 | ||||
| @ -516,6 +516,7 @@ var Font = (function FontClosure() { | ||||
|     this.defaultEncoding = properties.defaultEncoding; | ||||
| 
 | ||||
|     this.toUnicode = properties.toUnicode; | ||||
|     this.fallbackToUnicode = properties.fallbackToUnicode || new ToUnicodeMap(); | ||||
| 
 | ||||
|     this.toFontChar = []; | ||||
| 
 | ||||
| @ -2766,7 +2767,8 @@ var Font = (function FontClosure() { | ||||
|       width = isNum(width) ? width : this.defaultWidth; | ||||
|       var vmetric = this.vmetrics && this.vmetrics[widthCode]; | ||||
| 
 | ||||
|       var unicode = this.toUnicode.get(charcode) || charcode; | ||||
|       let unicode = this.toUnicode.get(charcode) || | ||||
|         this.fallbackToUnicode.get(charcode) || charcode; | ||||
|       if (typeof unicode === 'number') { | ||||
|         unicode = String.fromCharCode(unicode); | ||||
|       } | ||||
|  | ||||
							
								
								
									
										1
									
								
								test/pdfs/.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										1
									
								
								test/pdfs/.gitignore
									
									
									
									
										vendored
									
									
								
							| @ -53,6 +53,7 @@ | ||||
| !issue8061.pdf | ||||
| !issue8088.pdf | ||||
| !issue8125.pdf | ||||
| !issue8229.pdf | ||||
| !issue8372.pdf | ||||
| !issue8424.pdf | ||||
| !issue8480.pdf | ||||
|  | ||||
							
								
								
									
										
											BIN
										
									
								
								test/pdfs/issue8229.pdf
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								test/pdfs/issue8229.pdf
									
									
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							| @ -1438,6 +1438,13 @@ | ||||
|        "link": false, | ||||
|        "type": "text" | ||||
|     }, | ||||
|     {  "id": "issue8229", | ||||
|        "file": "pdfs/issue8229.pdf", | ||||
|        "md5": "a729f663782e87ebc1efad0755ebf6a5", | ||||
|        "rounds": 1, | ||||
|        "link": false, | ||||
|        "type": "text" | ||||
|     }, | ||||
|     {  "id": "ShowText-ShadingPattern", | ||||
|        "file": "pdfs/ShowText-ShadingPattern.pdf", | ||||
|        "md5": "fe683725db037ffe19d390969610a652", | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user