Remove language codes from text strings.
Also make sure UTF-16 strings have an even number of bytes (a trailing odd byte is dropped before decoding).
This commit is contained in:
parent
58316369e5
commit
eb5f610d18
@ -905,12 +905,21 @@ const PDFStringTranslateTable = [
|
||||
];
|
||||
|
||||
function stringToPDFString(str) {
|
||||
// See section 7.9.2.2 Text String Type.
|
||||
// The string can contain some language codes bracketed with 0x1b,
|
||||
// so we must remove them.
|
||||
if (str[0] >= "\xEF") {
|
||||
let encoding;
|
||||
if (str[0] === "\xFE" && str[1] === "\xFF") {
|
||||
encoding = "utf-16be";
|
||||
if (str.length % 2 === 1) {
|
||||
str = str.slice(0, -1);
|
||||
}
|
||||
} else if (str[0] === "\xFF" && str[1] === "\xFE") {
|
||||
encoding = "utf-16le";
|
||||
if (str.length % 2 === 1) {
|
||||
str = str.slice(0, -1);
|
||||
}
|
||||
} else if (str[0] === "\xEF" && str[1] === "\xBB" && str[2] === "\xBF") {
|
||||
encoding = "utf-8";
|
||||
}
|
||||
@ -919,7 +928,11 @@ function stringToPDFString(str) {
|
||||
try {
|
||||
const decoder = new TextDecoder(encoding, { fatal: true });
|
||||
const buffer = stringToBytes(str);
|
||||
return decoder.decode(buffer);
|
||||
const decoded = decoder.decode(buffer);
|
||||
if (!decoded.includes("\x1b")) {
|
||||
return decoded;
|
||||
}
|
||||
return decoded.replaceAll(/\x1b[^\x1b]*(?:\x1b|$)/g, "");
|
||||
} catch (ex) {
|
||||
warn(`stringToPDFString: "${ex}".`);
|
||||
}
|
||||
@ -928,7 +941,13 @@ function stringToPDFString(str) {
|
||||
// ISO Latin 1
|
||||
const strBuf = [];
|
||||
for (let i = 0, ii = str.length; i < ii; i++) {
|
||||
const code = PDFStringTranslateTable[str.charCodeAt(i)];
|
||||
const charCode = str.charCodeAt(i);
|
||||
if (charCode === 0x1b) {
|
||||
// eslint-disable-next-line no-empty
|
||||
while (++i < ii && str.charCodeAt(i) !== 0x1b) {}
|
||||
continue;
|
||||
}
|
||||
const code = PDFStringTranslateTable[charCode];
|
||||
strBuf.push(code ? String.fromCharCode(code) : str.charAt(i));
|
||||
}
|
||||
return strBuf.join("");
|
||||
|
@ -99,11 +99,21 @@ describe("util", function () {
|
||||
expect(stringToPDFString(str)).toEqual("string");
|
||||
});
|
||||
|
||||
it("handles incomplete UTF-16 big-endian strings", function () {
|
||||
const str = "\xFE\xFF\x00\x73\x00\x74\x00\x72\x00\x69\x00\x6E\x00";
|
||||
expect(stringToPDFString(str)).toEqual("strin");
|
||||
});
|
||||
|
||||
it("handles UTF-16 little-endian strings", function () {
|
||||
const str = "\xFF\xFE\x73\x00\x74\x00\x72\x00\x69\x00\x6E\x00\x67\x00";
|
||||
expect(stringToPDFString(str)).toEqual("string");
|
||||
});
|
||||
|
||||
it("handles incomplete UTF-16 little-endian strings", function () {
|
||||
const str = "\xFF\xFE\x73\x00\x74\x00\x72\x00\x69\x00\x6E\x00\x67";
|
||||
expect(stringToPDFString(str)).toEqual("strin");
|
||||
});
|
||||
|
||||
it("handles UTF-8 strings", function () {
|
||||
const simpleStr = "\xEF\xBB\xBF\x73\x74\x72\x69\x6E\x67";
|
||||
expect(stringToPDFString(simpleStr)).toEqual("string");
|
||||
@ -134,6 +144,22 @@ describe("util", function () {
|
||||
const str4 = "\xEF\xBB\xBF";
|
||||
expect(stringToPDFString(str4)).toEqual("");
|
||||
});
|
||||
|
||||
it("handles strings with language code", function () {
|
||||
// ISO Latin 1
|
||||
const str1 = "hello \x1benUS\x1bworld";
|
||||
expect(stringToPDFString(str1)).toEqual("hello world");
|
||||
|
||||
// UTF-16BE
|
||||
const str2 =
|
||||
"\xFE\xFF\x00h\x00e\x00l\x00l\x00o\x00 \x00\x1b\x00e\x00n\x00U\x00S\x00\x1b\x00w\x00o\x00r\x00l\x00d";
|
||||
expect(stringToPDFString(str2)).toEqual("hello world");
|
||||
|
||||
// UTF-16LE
|
||||
const str3 =
|
||||
"\xFF\xFEh\x00e\x00l\x00l\x00o\x00 \x00\x1b\x00e\x00n\x00U\x00S\x00\x1b\x00w\x00o\x00r\x00l\x00d\x00";
|
||||
expect(stringToPDFString(str3)).toEqual("hello world");
|
||||
});
|
||||
});
|
||||
|
||||
describe("ReadableStream", function () {
|
||||
|
Loading…
x
Reference in New Issue
Block a user