From f33ce5fc2d632a804281e56cee770ba25cf9dbab Mon Sep 17 00:00:00 2001
From: Jonas Jenwald <jonas.jenwald@gmail.com>
Date: Fri, 1 Apr 2022 11:50:18 +0200
Subject: [PATCH] Decode non-ASCII values found in the xfa:datasets (PR 14735
 follow-up)

*Please note:* This is possibly bad/wrong in general, but I figured that submitting it for review wouldn't hurt.

It seems that even Adobe Reader doesn't correctly handle the non-ASCII characters that appear in some of the fields; however, it should be pretty easy to improve things on the PDF.js side.
---
 src/core/dataset_reader.js | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/src/core/dataset_reader.js b/src/core/dataset_reader.js
index 7c0f6583c..71a9b7561 100644
--- a/src/core/dataset_reader.js
+++ b/src/core/dataset_reader.js
@@ -13,9 +13,19 @@
  * limitations under the License.
  */
 
+import { stringToUTF8String, warn } from "../shared/util.js";
 import { parseXFAPath } from "./core_utils.js";
 import { SimpleXMLParser } from "./xml_parser.js";
 
+function decodeString(str) { // Returns `str` decoded as UTF-8 when possible.
+  try {
+    return stringToUTF8String(str);
+  } catch (ex) {
+    warn(`UTF-8 decoding failed: "${ex}".`);
+    return str; // Decoding threw: fall back to the unmodified input.
+  }
+}
+
 class DatasetXMLParser extends SimpleXMLParser {
   constructor(options) {
     super(options);
@@ -60,10 +70,10 @@ class DatasetReader {
 
     const first = node.firstChild;
     if (first && first.nodeName === "value") {
-      return node.children.map(child => child.textContent);
+      return node.children.map(child => decodeString(child.textContent));
     }
 
-    return node.textContent;
+    return decodeString(node.textContent);
   }
 }