Decode non-ASCII values found in the xfa:datasets (PR 14735 follow-up)

*Please note:* This is possibly bad/wrong in general, but I figured that submitting it for review wouldn't hurt. It seems that even Adobe Reader doesn't correctly handle the non-ASCII characters that appear in some of the fields; however, it should be pretty easy to improve things on the PDF.js side.
2022-04-01 11:50:18 +02:00 · 2022-04-01 11:50:18 +02:00 · f33ce5fc2d
commit f33ce5fc2d
parent b0ec83262b
1 changed files with 12 additions and 2 deletions
--- a/src/core/dataset_reader.js
+++ b/src/core/dataset_reader.js
@@ -13,9 +13,19 @@
 * limitations under the License.
 */

+import { stringToUTF8String, warn } from "../shared/util.js";
 import { parseXFAPath } from "./core_utils.js";
 import { SimpleXMLParser } from "./xml_parser.js";

+function decodeString(str) { // Best-effort UTF-8 decode of `str`; decoding errors are caught, not propagated.
+  try {
+    return stringToUTF8String(str); // decoded value when the helper succeeds (presumably `str` holds raw UTF-8 bytes — confirm)
+  } catch (ex) {
+    warn(`UTF-8 decoding failed: "${ex}".`); // report the failure via the shared `warn` helper
+    return str; // fall back to the original, undecoded string
+  }
+}
+
 class DatasetXMLParser extends SimpleXMLParser {
  constructor(options) {
    super(options);
@@ -60,10 +70,10 @@ class DatasetReader {

    const first = node.firstChild;
    if (first && first.nodeName === "value") {
-      return node.children.map(child => child.textContent);
+      return node.children.map(child => decodeString(child.textContent));
    }

-    return node.textContent;
+    return decodeString(node.textContent);
  }
 }