Add (basic) UTF-8 support in the stringToPDFString helper function (issue 14449)
				
					
				
			This patch implements UTF-8 support by looking for the UTF-8 BOM, i.e. `\xEF\xBB\xBF`, in order to determine the encoding.[1] The actual conversion is done using the `TextDecoder` interface, which should be available in all environments/browsers that we support; please see https://developer.mozilla.org/en-US/docs/Web/API/TextDecoder#browser_compatibility

			---
			[1] Simply assuming that every string lacking a UTF-16 BOM must be UTF-8 encoded really doesn't seem correct, which is why the UTF-8 BOM is checked for explicitly.
This commit is contained in:
		
							parent
							
								
									ea57ef116e
								
							
						
					
					
						commit
						76444888fb
					
				| @ -957,26 +957,39 @@ const PDFStringTranslateTable = [ | ||||
| function stringToPDFString(str) { | ||||
|   const length = str.length, | ||||
|     strBuf = []; | ||||
|   // UTF-16BE BOM
 | ||||
|   if (str[0] === "\xFE" && str[1] === "\xFF") { | ||||
|     // UTF16BE BOM
 | ||||
|     for (let i = 2; i < length; i += 2) { | ||||
|       strBuf.push( | ||||
|         String.fromCharCode((str.charCodeAt(i) << 8) | str.charCodeAt(i + 1)) | ||||
|       ); | ||||
|     } | ||||
|   } else if (str[0] === "\xFF" && str[1] === "\xFE") { | ||||
|     // UTF16LE BOM
 | ||||
|     return strBuf.join(""); | ||||
|   } | ||||
|   // UTF-16LE BOM
 | ||||
|   if (str[0] === "\xFF" && str[1] === "\xFE") { | ||||
|     for (let i = 2; i < length; i += 2) { | ||||
|       strBuf.push( | ||||
|         String.fromCharCode((str.charCodeAt(i + 1) << 8) | str.charCodeAt(i)) | ||||
|       ); | ||||
|     } | ||||
|   } else { | ||||
|     for (let i = 0; i < length; ++i) { | ||||
|       const code = PDFStringTranslateTable[str.charCodeAt(i)]; | ||||
|       strBuf.push(code ? String.fromCharCode(code) : str.charAt(i)); | ||||
|     return strBuf.join(""); | ||||
|   } | ||||
|   // UTF-8 BOM
 | ||||
|   if (str[0] === "\xEF" && str[1] === "\xBB" && str[2] === "\xBF") { | ||||
|     try { | ||||
|       const decoder = new TextDecoder("utf-8", { fatal: true }); | ||||
|       const buffer = stringToBytes(str); | ||||
|       return decoder.decode(buffer); | ||||
|     } catch (ex) { | ||||
|       warn(`stringToPDFString: "${ex}".`); | ||||
|     } | ||||
|   } | ||||
|   // ISO Latin 1
 | ||||
|   for (let i = 0; i < length; ++i) { | ||||
|     const code = PDFStringTranslateTable[str.charCodeAt(i)]; | ||||
|     strBuf.push(code ? String.fromCharCode(code) : str.charAt(i)); | ||||
|   } | ||||
|   return strBuf.join(""); | ||||
| } | ||||
| 
 | ||||
|  | ||||
| @ -159,6 +159,19 @@ describe("util", function () { | ||||
|       expect(stringToPDFString(str)).toEqual("string"); | ||||
|     }); | ||||
| 
 | ||||
|     it("handles UTF-8 strings", function () { | ||||
|       const simpleStr = "\xEF\xBB\xBF\x73\x74\x72\x69\x6E\x67"; | ||||
|       expect(stringToPDFString(simpleStr)).toEqual("string"); | ||||
| 
 | ||||
|       const complexStr = | ||||
|         "\xEF\xBB\xBF\xE8\xA1\xA8\xE3\x83\x9D\xE3\x81\x82\x41\xE9\xB7\x97" + | ||||
|         "\xC5\x92\xC3\xA9\xEF\xBC\xA2\xE9\x80\x8D\xC3\x9C\xC3\x9F\xC2\xAA" + | ||||
|         "\xC4\x85\xC3\xB1\xE4\xB8\x82\xE3\x90\x80\xF0\xA0\x80\x80"; | ||||
|       expect(stringToPDFString(complexStr)).toEqual( | ||||
|         "表ポあA鷗ŒéＢ逍Üßªąñ丂㐀𠀀" | ||||
|       ); | ||||
|     }); | ||||
| 
 | ||||
|     it("handles empty strings", function () { | ||||
|       // ISO Latin 1
 | ||||
|       const str1 = ""; | ||||
| @ -171,6 +184,10 @@ describe("util", function () { | ||||
|       // UTF-16LE
 | ||||
|       const str3 = "\xFF\xFE"; | ||||
|       expect(stringToPDFString(str3)).toEqual(""); | ||||
| 
 | ||||
|       // UTF-8
 | ||||
|       const str4 = "\xEF\xBB\xBF"; | ||||
|       expect(stringToPDFString(str4)).toEqual(""); | ||||
|     }); | ||||
|   }); | ||||
| 
 | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user