From 40b9be137f276ab54598aad8726c982ed6d58075 Mon Sep 17 00:00:00 2001
From: Nils Maier
Date: Sun, 27 May 2012 22:49:28 +0200
Subject: [PATCH] Decode XML metadata as UTF-8

XML uses UTF-8 by default, which needs to be decoded to a JavaScript
String prior to feeding it to the DOMParser. In an ideal world, the XML
would actually be analyzed and the specified charset would be used;
however, that does not seem feasible unless JS engines get iconv
bindings.

Fixes GH-1692
---
 src/obj.js  | 7 ++++++-
 src/util.js | 4 ++++
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/src/obj.js b/src/obj.js
index 3432ac68d..acc9e1284 100644
--- a/src/obj.js
+++ b/src/obj.js
@@ -140,7 +140,12 @@ var Catalog = (function CatalogClosure() {
 
         if (isName(type) && isName(subtype) &&
             type.name === 'Metadata' && subtype.name === 'XML') {
-          metadata = stringToPDFString(bytesToString(stream.getBytes()));
+          // XXX: This should examine the charset the XML document defines,
+          // however since there are currently no real means to decode
+          // arbitrary charsets, let's just hope that the author of the PDF
+          // was reasonable enough to stick with the XML default charset,
+          // which is UTF-8.
+          metadata = stringToUTF8String(bytesToString(stream.getBytes()));
         }
       }
 
diff --git a/src/util.js b/src/util.js
index 90e6cee5d..fe5d895e3 100644
--- a/src/util.js
+++ b/src/util.js
@@ -302,6 +302,10 @@ function stringToPDFString(str) {
   return str2;
 }
 
+function stringToUTF8String(str) {
+  return decodeURIComponent(escape(str));
+}
+
 function isBool(v) {
   return typeof v == 'boolean';
 }