From 76444888fb593f32d28c4ef119405a3177ae29ba Mon Sep 17 00:00:00 2001 From: Jonas Jenwald Date: Fri, 14 Jan 2022 17:58:47 +0100 Subject: [PATCH] Add (basic) UTF-8 support in the `stringToPDFString` helper function (issue 14449) This patch implements this by looking for the UTF-8 BOM, i.e. `\xEF\xBB\xBF`, in order to determine the encoding.[1] The actual conversion is done using the `TextDecoder` interface, which should be available in all environments/browsers that we support; please see https://developer.mozilla.org/en-US/docs/Web/API/TextDecoder#browser_compatibility --- [1] Assuming that everything lacking a UTF-16 BOM would have to be UTF-8 encoded really doesn't seem correct. --- src/shared/util.js | 27 ++++++++++++++++++++------- test/unit/util_spec.js | 17 +++++++++++++++++ 2 files changed, 37 insertions(+), 7 deletions(-) diff --git a/src/shared/util.js b/src/shared/util.js index 214624236..9d023bfce 100644 --- a/src/shared/util.js +++ b/src/shared/util.js @@ -957,26 +957,39 @@ const PDFStringTranslateTable = [ function stringToPDFString(str) { const length = str.length, strBuf = []; + // UTF-16BE BOM if (str[0] === "\xFE" && str[1] === "\xFF") { - // UTF16BE BOM for (let i = 2; i < length; i += 2) { strBuf.push( String.fromCharCode((str.charCodeAt(i) << 8) | str.charCodeAt(i + 1)) ); } - } else if (str[0] === "\xFF" && str[1] === "\xFE") { - // UTF16LE BOM + return strBuf.join(""); + } + // UTF-16LE BOM + if (str[0] === "\xFF" && str[1] === "\xFE") { for (let i = 2; i < length; i += 2) { strBuf.push( String.fromCharCode((str.charCodeAt(i + 1) << 8) | str.charCodeAt(i)) ); } - } else { - for (let i = 0; i < length; ++i) { - const code = PDFStringTranslateTable[str.charCodeAt(i)]; - strBuf.push(code ? 
String.fromCharCode(code) : str.charAt(i)); + return strBuf.join(""); + } + // UTF-8 BOM + if (str[0] === "\xEF" && str[1] === "\xBB" && str[2] === "\xBF") { + try { + const decoder = new TextDecoder("utf-8", { fatal: true }); + const buffer = stringToBytes(str); + return decoder.decode(buffer); + } catch (ex) { + warn(`stringToPDFString: "${ex}".`); } } + // ISO Latin 1 + for (let i = 0; i < length; ++i) { + const code = PDFStringTranslateTable[str.charCodeAt(i)]; + strBuf.push(code ? String.fromCharCode(code) : str.charAt(i)); + } return strBuf.join(""); } diff --git a/test/unit/util_spec.js b/test/unit/util_spec.js index 941e2542c..8f766a644 100644 --- a/test/unit/util_spec.js +++ b/test/unit/util_spec.js @@ -159,6 +159,19 @@ describe("util", function () { expect(stringToPDFString(str)).toEqual("string"); }); + it("handles UTF-8 strings", function () { + const simpleStr = "\xEF\xBB\xBF\x73\x74\x72\x69\x6E\x67"; + expect(stringToPDFString(simpleStr)).toEqual("string"); + + const complexStr = + "\xEF\xBB\xBF\xE8\xA1\xA8\xE3\x83\x9D\xE3\x81\x82\x41\xE9\xB7\x97" + + "\xC5\x92\xC3\xA9\xEF\xBC\xA2\xE9\x80\x8D\xC3\x9C\xC3\x9F\xC2\xAA" + + "\xC4\x85\xC3\xB1\xE4\xB8\x82\xE3\x90\x80\xF0\xA0\x80\x80"; + expect(stringToPDFString(complexStr)).toEqual( + "表ポあA鷗ŒéＢ逍Üßªąñ丂㐀𠀀" + ); + }); + it("handles empty strings", function () { // ISO Latin 1 const str1 = ""; @@ -171,6 +184,10 @@ describe("util", function () { // UTF-16LE const str3 = "\xFF\xFE"; expect(stringToPDFString(str3)).toEqual(""); + + // UTF-8 + const str4 = "\xEF\xBB\xBF"; + expect(stringToPDFString(str4)).toEqual(""); }); });