From f33ce5fc2d632a804281e56cee770ba25cf9dbab Mon Sep 17 00:00:00 2001 From: Jonas Jenwald Date: Fri, 1 Apr 2022 11:50:18 +0200 Subject: [PATCH] Decode non-ASCII values found in the xfa:datasets (PR 14735 follow-up) *Please note:* This is possibly bad/wrong in general, but I figured that submitting it for review wouldn't hurt. It seems that even Adobe Reader doesn't handle the non-ASCII characters that appear in some of the fields correctly; however, it should be pretty easy to improve things on the PDF.js side. --- src/core/dataset_reader.js | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/src/core/dataset_reader.js b/src/core/dataset_reader.js index 7c0f6583c..71a9b7561 100644 --- a/src/core/dataset_reader.js +++ b/src/core/dataset_reader.js @@ -13,9 +13,19 @@ * limitations under the License. */ +import { stringToUTF8String, warn } from "../shared/util.js"; import { parseXFAPath } from "./core_utils.js"; import { SimpleXMLParser } from "./xml_parser.js"; +function decodeString(str) { + try { + return stringToUTF8String(str); + } catch (ex) { + warn(`UTF-8 decoding failed: "${ex}".`); + return str; + } +} + class DatasetXMLParser extends SimpleXMLParser { constructor(options) { super(options); @@ -60,10 +70,10 @@ class DatasetReader { const first = node.firstChild; if (first && first.nodeName === "value") { - return node.children.map(child => child.textContent); + return node.children.map(child => decodeString(child.textContent)); } - return node.textContent; + return decodeString(node.textContent); } }