Add (basic) UTF-8 support in the stringToPDFString helper function (issue 14449)

This patch implements this by looking for the UTF-8 BOM, i.e. `\xEF\xBB\xBF`, in order to determine the encoding.[1]
The actual conversion is done using the `TextDecoder` interface, which should be available in all environments/browsers that we support; please see https://developer.mozilla.org/en-US/docs/Web/API/TextDecoder#browser_compatibility

---
[1] Assuming that everything lacking a UTF-16 BOM would have to be UTF-8 encoded really doesn't seem correct.
This commit is contained in:
Jonas Jenwald 2022-01-14 17:58:47 +01:00
parent ea57ef116e
commit 76444888fb
2 changed files with 37 additions and 7 deletions

View File

@ -957,26 +957,39 @@ const PDFStringTranslateTable = [
/**
 * Converts a raw PDF "byte string" (a JavaScript string in which every
 * character holds one byte) into a regular JavaScript string.
 *
 * The encoding is chosen from a leading byte-order mark:
 *  - `\xFE\xFF`      => UTF-16BE
 *  - `\xFF\xFE`      => UTF-16LE
 *  - `\xEF\xBB\xBF`  => UTF-8 (decoded with `TextDecoder`)
 *  - anything else   => single-byte text mapped via `PDFStringTranslateTable`
 *
 * @param {string} str - The byte string to convert.
 * @returns {string} The decoded string; the BOM itself is not part of the
 *   result for the UTF-16 and (successfully decoded) UTF-8 cases.
 */
function stringToPDFString(str) {
  const length = str.length;
  const strBuf = [];

  // UTF-16BE BOM
  if (str[0] === "\xFE" && str[1] === "\xFF") {
    // Start after the two BOM bytes; high byte first.
    for (let i = 2; i < length; i += 2) {
      strBuf.push(
        String.fromCharCode((str.charCodeAt(i) << 8) | str.charCodeAt(i + 1))
      );
    }
    return strBuf.join("");
  }

  // UTF-16LE BOM
  if (str[0] === "\xFF" && str[1] === "\xFE") {
    // Start after the two BOM bytes; low byte first.
    for (let i = 2; i < length; i += 2) {
      strBuf.push(
        String.fromCharCode((str.charCodeAt(i + 1) << 8) | str.charCodeAt(i))
      );
    }
    return strBuf.join("");
  }

  // UTF-8 BOM
  if (str[0] === "\xEF" && str[1] === "\xBB" && str[2] === "\xBF") {
    try {
      // `fatal: true` makes malformed input throw instead of silently
      // producing replacement characters.
      const decoder = new TextDecoder("utf-8", { fatal: true });
      const buffer = stringToBytes(str);
      return decoder.decode(buffer);
    } catch (ex) {
      // Deliberately best-effort: report the problem and fall through to the
      // single-byte conversion below rather than failing outright.
      warn(`stringToPDFString: "${ex}".`);
    }
  }

  // ISO Latin 1
  for (let i = 0; i < length; ++i) {
    // A falsy table entry means "no special mapping": keep the character.
    const code = PDFStringTranslateTable[str.charCodeAt(i)];
    strBuf.push(code ? String.fromCharCode(code) : str.charAt(i));
  }
  return strBuf.join("");
}

View File

@ -159,6 +159,19 @@ describe("util", function () {
expect(stringToPDFString(str)).toEqual("string");
});
it("handles UTF-8 strings", function () {
  // UTF-8 BOM followed by the ASCII bytes of "string".
  const simpleStr = "\xEF\xBB\xBF\x73\x74\x72\x69\x6E\x67";
  expect(stringToPDFString(simpleStr)).toEqual("string");

  // UTF-8 BOM followed by a mix of 1-, 2-, 3- and 4-byte sequences.
  const complexStr =
    "\xEF\xBB\xBF\xE8\xA1\xA8\xE3\x83\x9D\xE3\x81\x82\x41\xE9\xB7\x97" +
    "\xC5\x92\xC3\xA9\xEF\xBC\xA2\xE9\x80\x8D\xC3\x9C\xC3\x9F\xC2\xAA" +
    "\xC4\x85\xC3\xB1\xE4\xB8\x82\xE3\x90\x80\xF0\xA0\x80\x80";
  // The expected value must mirror the bytes above one code point at a time.
  // `\uFF22` (fullwidth B, bytes EF BC A2), `\u00DF` (bytes C3 9F) and
  // `\u00AA` (bytes C2 AA) are written as escapes so that they cannot be
  // dropped or merged by a lossy re-encoding of this file.
  expect(stringToPDFString(complexStr)).toEqual(
    "表ポあA鷗Œé\uFF22逍Ü\u00DF\u00AAąñ丂㐀𠀀"
  );
});
it("handles empty strings", function () {
// ISO Latin 1
const str1 = "";
@ -171,6 +184,10 @@ describe("util", function () {
// UTF-16LE
const str3 = "\xFF\xFE";
expect(stringToPDFString(str3)).toEqual("");
// UTF-8
const str4 = "\xEF\xBB\xBF";
expect(stringToPDFString(str4)).toEqual("");
});
});