Decode non-ASCII values found in the xfa:datasets (PR 14735 follow-up)

*Please note:* This is possibly bad/wrong in general, but I figured that submitting it for review wouldn't hurt.

It seems that even Adobe Reader doesn't correctly handle the non-ASCII characters that appear in some of the fields; however, it should be pretty easy to improve things on the PDF.js side.
This commit is contained in:
Jonas Jenwald 2022-04-01 11:50:18 +02:00
parent b0ec83262b
commit f33ce5fc2d

View File

@ -13,9 +13,19 @@
* limitations under the License.
*/
import { stringToUTF8String, warn } from "../shared/util.js";
import { parseXFAPath } from "./core_utils.js";
import { SimpleXMLParser } from "./xml_parser.js";
/**
 * Best-effort decoding of a value through `stringToUTF8String`.
 *
 * If the helper throws, the failure is reported with `warn` and the input is
 * handed back untouched, so a decoding problem never propagates to the caller.
 *
 * @param str - The raw value to decode (presumably a string whose char codes
 *   hold UTF-8 bytes; confirm against `stringToUTF8String`).
 * @returns Whatever `stringToUTF8String` produced, or `str` itself when the
 *   decoding attempt threw.
 */
function decodeString(str) {
  // Start from the fallback so that there is a single exit point.
  let decoded = str;
  try {
    decoded = stringToUTF8String(str);
  } catch (reason) {
    warn(`UTF-8 decoding failed: "${reason}".`);
  }
  return decoded;
}
class DatasetXMLParser extends SimpleXMLParser {
constructor(options) {
super(options);
@ -60,10 +70,10 @@ class DatasetReader {
const first = node.firstChild;
if (first && first.nodeName === "value") {
return node.children.map(child => child.textContent);
return node.children.map(child => decodeString(child.textContent));
}
return node.textContent;
return decodeString(node.textContent);
}
}