From 76444888fb593f32d28c4ef119405a3177ae29ba Mon Sep 17 00:00:00 2001
From: Jonas Jenwald <jonas.jenwald@gmail.com>
Date: Fri, 14 Jan 2022 17:58:47 +0100
Subject: [PATCH] Add (basic) UTF-8 support in the `stringToPDFString` helper
 function (issue 14449)

This patch adds the UTF-8 support by looking for the UTF-8 BOM, i.e. `\xEF\xBB\xBF`, at the start of the string in order to determine the encoding.[1]
The actual conversion is done using the `TextDecoder` interface, which should be available in all environments/browsers that we support; please see https://developer.mozilla.org/en-US/docs/Web/API/TextDecoder#browser_compatibility

---
[1] Simply assuming that every string lacking a UTF-16 BOM must be UTF-8 encoded really doesn't seem correct, which is why the UTF-8 BOM is checked for explicitly.
---
 src/shared/util.js     | 27 ++++++++++++++++++++-------
 test/unit/util_spec.js | 17 +++++++++++++++++
 2 files changed, 37 insertions(+), 7 deletions(-)

diff --git a/src/shared/util.js b/src/shared/util.js
index 214624236..9d023bfce 100644
--- a/src/shared/util.js
+++ b/src/shared/util.js
@@ -957,26 +957,39 @@ const PDFStringTranslateTable = [
 function stringToPDFString(str) {
   const length = str.length,
     strBuf = [];
+  // UTF-16BE BOM
   if (str[0] === "\xFE" && str[1] === "\xFF") {
-    // UTF16BE BOM
     for (let i = 2; i < length; i += 2) {
       strBuf.push(
         String.fromCharCode((str.charCodeAt(i) << 8) | str.charCodeAt(i + 1))
       );
     }
-  } else if (str[0] === "\xFF" && str[1] === "\xFE") {
-    // UTF16LE BOM
+    return strBuf.join("");
+  }
+  // UTF-16LE BOM
+  if (str[0] === "\xFF" && str[1] === "\xFE") {
     for (let i = 2; i < length; i += 2) {
       strBuf.push(
         String.fromCharCode((str.charCodeAt(i + 1) << 8) | str.charCodeAt(i))
       );
     }
-  } else {
-    for (let i = 0; i < length; ++i) {
-      const code = PDFStringTranslateTable[str.charCodeAt(i)];
-      strBuf.push(code ? String.fromCharCode(code) : str.charAt(i));
+    return strBuf.join("");
+  }
+  // UTF-8 BOM
+  if (str[0] === "\xEF" && str[1] === "\xBB" && str[2] === "\xBF") {
+    try {
+      const decoder = new TextDecoder("utf-8", { fatal: true });
+      const buffer = stringToBytes(str);
+      return decoder.decode(buffer);
+    } catch (ex) {
+      warn(`stringToPDFString: "${ex}".`);
     }
   }
+  // ISO Latin 1
+  for (let i = 0; i < length; ++i) {
+    const code = PDFStringTranslateTable[str.charCodeAt(i)];
+    strBuf.push(code ? String.fromCharCode(code) : str.charAt(i));
+  }
   return strBuf.join("");
 }
 
diff --git a/test/unit/util_spec.js b/test/unit/util_spec.js
index 941e2542c..8f766a644 100644
--- a/test/unit/util_spec.js
+++ b/test/unit/util_spec.js
@@ -159,6 +159,19 @@ describe("util", function () {
       expect(stringToPDFString(str)).toEqual("string");
     });
 
+    it("handles UTF-8 strings", function () {
+      const simpleStr = "\xEF\xBB\xBF\x73\x74\x72\x69\x6E\x67";
+      expect(stringToPDFString(simpleStr)).toEqual("string");
+
+      const complexStr =
+        "\xEF\xBB\xBF\xE8\xA1\xA8\xE3\x83\x9D\xE3\x81\x82\x41\xE9\xB7\x97" +
+        "\xC5\x92\xC3\xA9\xEF\xBC\xA2\xE9\x80\x8D\xC3\x9C\xC3\x9F\xC2\xAA" +
+        "\xC4\x85\xC3\xB1\xE4\xB8\x82\xE3\x90\x80\xF0\xA0\x80\x80";
+      expect(stringToPDFString(complexStr)).toEqual(
+        "表ポあA鷗ŒéＢ逍Üßªąñ丂㐀𠀀"
+      );
+    });
+
     it("handles empty strings", function () {
       // ISO Latin 1
       const str1 = "";
@@ -171,6 +184,10 @@ describe("util", function () {
       // UTF-16LE
       const str3 = "\xFF\xFE";
       expect(stringToPDFString(str3)).toEqual("");
+
+      // UTF-8
+      const str4 = "\xEF\xBB\xBF";
+      expect(stringToPDFString(str4)).toEqual("");
     });
   });