From 2eaa708e3a6b0a07ee79e1312beaa436d9757343 Mon Sep 17 00:00:00 2001 From: Jonas Jenwald Date: Wed, 16 Nov 2022 12:05:00 +0100 Subject: [PATCH 1/4] Combine the `stringToUTF16String` and `stringToUTF16BEString` helper functions Given that these functions are virtually identical, with the latter only adding a BOM, we can combine the two. Furthermore, since both functions were only used on the worker-thread, there's no reason to duplicate this functionality in both of the `pdf.js` and `pdf.worker.js` files. --- src/core/annotation.js | 16 ++++++++++++---- src/core/core_utils.js | 5 ++++- src/shared/util.js | 13 ------------- test/unit/core_utils_spec.js | 25 +++++++++++++++++++++++++ test/unit/util_spec.js | 13 ------------- 5 files changed, 41 insertions(+), 31 deletions(-) diff --git a/src/core/annotation.js b/src/core/annotation.js index 23fa29252..2582f79a0 100644 --- a/src/core/annotation.js +++ b/src/core/annotation.js @@ -34,7 +34,6 @@ import { RenderingIntentFlag, shadow, stringToPDFString, - stringToUTF16BEString, unreachable, Util, warn, @@ -1879,7 +1878,11 @@ class WidgetAnnotation extends Annotation { value, }; - const encoder = val => (isAscii(val) ? val : stringToUTF16BEString(val)); + const encoder = val => { + return isAscii(val) + ? val + : stringToUTF16String(val, /* bigEndian = */ true); + }; dict.set("V", Array.isArray(value) ? value.map(encoder) : encoder(value)); const maybeMK = this._getMKDict(rotation); @@ -3546,14 +3549,19 @@ class FreeTextAnnotation extends MarkupAnnotation { freetext.set("DA", da); freetext.set( "Contents", - isAscii(value) ? value : stringToUTF16BEString(value) + isAscii(value) + ? value + : stringToUTF16String(value, /* bigEndian = */ true) ); freetext.set("F", 4); freetext.set("Border", [0, 0, 0]); freetext.set("Rotate", rotation); if (user) { - freetext.set("T", isAscii(user) ? user : stringToUTF16BEString(user)); + freetext.set( + "T", + isAscii(user) ? user : stringToUTF16String(user, /* bigEndian = */ true) + ); } if (apRef || ap) { diff --git a/src/core/core_utils.js b/src/core/core_utils.js index f8ba471ee..6794f7768 100644 --- a/src/core/core_utils.js +++ b/src/core/core_utils.js @@ -584,8 +584,11 @@ function stringToUTF16HexString(str) { return buf.join(""); } -function stringToUTF16String(str) { +function stringToUTF16String(str, bigEndian = false) { const buf = []; + if (bigEndian) { + buf.push("\xFE\xFF"); + } for (let i = 0, ii = str.length; i < ii; i++) { const char = str.charCodeAt(i); buf.push( diff --git a/src/shared/util.js b/src/shared/util.js index e193268bc..4cf01b515 100644 --- a/src/shared/util.js +++ b/src/shared/util.js @@ -1055,18 +1055,6 @@ function isAscii(str) { return /^[\x00-\x7F]*$/.test(str); } -function stringToUTF16BEString(str) { - const buf = ["\xFE\xFF"]; - for (let i = 0, ii = str.length; i < ii; i++) { - const char = str.charCodeAt(i); - buf.push( - String.fromCharCode((char >> 8) & 0xff), - String.fromCharCode(char & 0xff) - ); - } - return buf.join(""); -} - function stringToUTF8String(str) { return decodeURIComponent(escape(str)); } @@ -1198,7 +1186,6 @@ export { string32, stringToBytes, stringToPDFString, - stringToUTF16BEString, stringToUTF8String, TextRenderingMode, UnexpectedResponseException, diff --git a/test/unit/core_utils_spec.js b/test/unit/core_utils_spec.js index cc5c9e69e..6072855f9 100644 --- a/test/unit/core_utils_spec.js +++ b/test/unit/core_utils_spec.js @@ -21,6 +21,7 @@ import { isWhiteSpace, log2, parseXFAPath, + stringToUTF16String, toRomanNumerals, validateCSSFont, } from "../../src/core/core_utils.js"; @@ -333,4 +334,28 @@ describe("core_utils", function () { expect(cssFontInfo.italicAngle).toEqual("2.718"); }); }); + + describe("stringToUTF16String", function () { + it("should encode a string in UTF16", function () { + expect(stringToUTF16String("hello world")).toEqual( + "\0h\0e\0l\0l\0o\0 \0w\0o\0r\0l\0d" + ); + + expect(stringToUTF16String("こんにちは世界の")).toEqual( + "\x30\x53\x30\x93\x30\x6b\x30\x61\x30\x6f\x4e\x16\x75\x4c\x30\x6e" + ); + }); + + it("should encode a string in UTF16BE with a BOM", function () { + expect( + stringToUTF16String("hello world", /* bigEndian = */ true) + ).toEqual("\xfe\xff\0h\0e\0l\0l\0o\0 \0w\0o\0r\0l\0d"); + + expect( + stringToUTF16String("こんにちは世界の", /* bigEndian = */ true) + ).toEqual( + "\xfe\xff\x30\x53\x30\x93\x30\x6b\x30\x61\x30\x6f\x4e\x16\x75\x4c\x30\x6e" + ); + }); + }); }); diff --git a/test/unit/util_spec.js b/test/unit/util_spec.js index 39874013e..3eab37e21 100644 --- a/test/unit/util_spec.js +++ b/test/unit/util_spec.js @@ -24,7 +24,6 @@ import { string32, stringToBytes, stringToPDFString, - stringToUTF16BEString, } from "../../src/shared/util.js"; describe("util", function () { @@ -270,16 +269,4 @@ describe("util", function () { ); }); }); - - describe("stringToUTF16BEString", function () { - it("should encode a string in UTF16BE with a BOM", function () { - expect(stringToUTF16BEString("hello world")).toEqual( - "\xfe\xff\0h\0e\0l\0l\0o\0 \0w\0o\0r\0l\0d" - ); - expect(stringToUTF16BEString("こんにちは世界の")).toEqual( - "\xfe\xff\x30\x53\x30\x93\x30\x6b\x30\x61" + - "\x30\x6f\x4e\x16\x75\x4c\x30\x6e" - ); - }); - }); }); From e5859e145def0fc01a0759e7ded9bdfd54ddeaf7 Mon Sep 17 00:00:00 2001 From: Jonas Jenwald Date: Wed, 16 Nov 2022 12:07:24 +0100 Subject: [PATCH 2/4] Move the `isAscii` helper function into the worker-thread Given that this helper function is only used on the worker-thread, there's no reason to duplicate it in both of the `pdf.js` and `pdf.worker.js` files. --- src/core/annotation.js | 2 +- src/core/core_utils.js | 5 +++++ src/shared/util.js | 5 ----- test/unit/core_utils_spec.js | 11 +++++++++++ test/unit/util_spec.js | 11 ----------- 5 files changed, 17 insertions(+), 17 deletions(-) diff --git a/src/core/annotation.js b/src/core/annotation.js index 2582f79a0..4ee0ab1e3 100644 --- a/src/core/annotation.js +++ b/src/core/annotation.js @@ -27,7 +27,6 @@ import { FeatureTest, getModificationDate, IDENTITY_MATRIX, - isAscii, LINE_DESCENT_FACTOR, LINE_FACTOR, OPS, @@ -42,6 +41,7 @@ import { collectActions, getInheritableProperty, getRotationMatrix, + isAscii, numberToString, stringToUTF16String, } from "./core_utils.js"; diff --git a/src/core/core_utils.js b/src/core/core_utils.js index 6794f7768..e8be16014 100644 --- a/src/core/core_utils.js +++ b/src/core/core_utils.js @@ -572,6 +572,10 @@ function getNewAnnotationsMap(annotationStorage) { return newAnnotationsByPage.size > 0 ? newAnnotationsByPage : null; } +function isAscii(str) { + return /^[\x00-\x7F]*$/.test(str); +} + function stringToUTF16HexString(str) { const buf = []; for (let i = 0, ii = str.length; i < ii; i++) { @@ -622,6 +626,7 @@ export { getLookupTableFactory, getNewAnnotationsMap, getRotationMatrix, + isAscii, isWhiteSpace, log2, MissingDataException, diff --git a/src/shared/util.js b/src/shared/util.js index 4cf01b515..c6497750b 100644 --- a/src/shared/util.js +++ b/src/shared/util.js @@ -1051,10 +1051,6 @@ function escapeString(str) { }); } -function isAscii(str) { - return /^[\x00-\x7F]*$/.test(str); -} - function stringToUTF8String(str) { return decodeURIComponent(escape(str)); } @@ -1168,7 +1164,6 @@ export { InvalidPDFException, isArrayBuffer, isArrayEqual, - isAscii, LINE_DESCENT_FACTOR, LINE_FACTOR, MissingPDFException, diff --git a/test/unit/core_utils_spec.js b/test/unit/core_utils_spec.js index 6072855f9..010929d99 100644 --- a/test/unit/core_utils_spec.js +++ b/test/unit/core_utils_spec.js @@ -18,6 +18,7 @@ import { encodeToXmlString, escapePDFName, getInheritableProperty, + isAscii, isWhiteSpace, log2, parseXFAPath, @@ -335,6 +336,16 @@ describe("core_utils", function () { }); }); + describe("isAscii", function () { + it("handles ascii/non-ascii strings", function () { + expect(isAscii("hello world")).toEqual(true); + expect(isAscii("こんにちは世界の")).toEqual(false); + expect(isAscii("hello world in Japanese is こんにちは世界の")).toEqual( + false + ); + }); + }); + describe("stringToUTF16String", function () { it("should encode a string in UTF16", function () { expect(stringToUTF16String("hello world")).toEqual( diff --git a/test/unit/util_spec.js b/test/unit/util_spec.js index 3eab37e21..ed3726fde 100644 --- a/test/unit/util_spec.js +++ b/test/unit/util_spec.js @@ -20,7 +20,6 @@ import { escapeString, getModificationDate, isArrayBuffer, - isAscii, string32, stringToBytes, stringToPDFString, @@ -259,14 +258,4 @@ describe("util", function () { expect(getModificationDate(date)).toEqual("31410609020653"); }); }); - - describe("isAscii", function () { - it("handles ascii/non-ascii strings", function () { - expect(isAscii("hello world")).toEqual(true); - expect(isAscii("こんにちは世界の")).toEqual(false); - expect(isAscii("hello world in Japanese is こんにちは世界の")).toEqual( - false - ); - }); - }); }); From 9adc7859c82b3a7a87b7971d82315fcd7795ac4c Mon Sep 17 00:00:00 2001 From: Jonas Jenwald Date: Wed, 16 Nov 2022 12:18:13 +0100 Subject: [PATCH 3/4] Move the `escapeString` helper function into the worker-thread Given that this helper function is only used on the worker-thread, there's no reason to duplicate it in both of the `pdf.js` and `pdf.worker.js` files. --- src/core/annotation.js | 2 +- src/core/core_utils.js | 14 ++++++++++++++ src/core/writer.js | 9 +++++++-- src/shared/util.js | 15 --------------- test/unit/core_utils_spec.js | 9 +++++++++ test/unit/util_spec.js | 9 --------- 6 files changed, 31 insertions(+), 27 deletions(-) diff --git a/src/core/annotation.js b/src/core/annotation.js index 4ee0ab1e3..353537110 100644 --- a/src/core/annotation.js +++ b/src/core/annotation.js @@ -23,7 +23,6 @@ import { AnnotationType, assert, BASELINE_FACTOR, - escapeString, FeatureTest, getModificationDate, IDENTITY_MATRIX, @@ -39,6 +38,7 @@ import { } from "../shared/util.js"; import { collectActions, + escapeString, getInheritableProperty, getRotationMatrix, isAscii, diff --git a/src/core/core_utils.js b/src/core/core_utils.js index e8be16014..a4c23b7c2 100644 --- a/src/core/core_utils.js +++ b/src/core/core_utils.js @@ -313,6 +313,19 @@ function escapePDFName(str) { return buffer.join(""); } +// Replace "(", ")", "\n", "\r" and "\" by "\(", "\)", "\\n", "\\r" and "\\" +// in order to write it in a PDF file. +function escapeString(str) { + return str.replace(/([()\\\n\r])/g, match => { + if (match === "\n") { + return "\\n"; + } else if (match === "\r") { + return "\\r"; + } + return `\\${match}`; + }); +} + function _collectJS(entry, xref, list, parents) { if (!entry) { return; @@ -621,6 +634,7 @@ export { DocStats, encodeToXmlString, escapePDFName, + escapeString, getArrayLookupTableFactory, getInheritableProperty, getLookupTableFactory, diff --git a/src/core/writer.js b/src/core/writer.js index 1de23297b..51626f157 100644 --- a/src/core/writer.js +++ b/src/core/writer.js @@ -13,9 +13,14 @@ * limitations under the License. */ -import { bytesToString, escapeString, warn } from "../shared/util.js"; +import { bytesToString, warn } from "../shared/util.js"; import { Dict, Name, Ref } from "./primitives.js"; -import { escapePDFName, numberToString, parseXFAPath } from "./core_utils.js"; +import { + escapePDFName, + escapeString, + numberToString, + parseXFAPath, +} from "./core_utils.js"; import { SimpleDOMNode, SimpleXMLParser } from "./xml_parser.js"; import { BaseStream } from "./base_stream.js"; import { calculateMD5 } from "./crypto.js"; diff --git a/src/shared/util.js b/src/shared/util.js index c6497750b..6394e383c 100644 --- a/src/shared/util.js +++ b/src/shared/util.js @@ -1037,20 +1037,6 @@ function stringToPDFString(str) { return strBuf.join(""); } -function escapeString(str) { - // replace "(", ")", "\n", "\r" and "\" - // by "\(", "\)", "\\n", "\\r" and "\\" - // in order to write it in a PDF file. - return str.replace(/([()\\\n\r])/g, match => { - if (match === "\n") { - return "\\n"; - } else if (match === "\r") { - return "\\r"; - } - return `\\${match}`; - }); -} - function stringToUTF8String(str) { return decodeURIComponent(escape(str)); } @@ -1151,7 +1137,6 @@ export { createPromiseCapability, createValidAbsoluteUrl, DocumentActionEventType, - escapeString, FeatureTest, FONT_IDENTITY_MATRIX, FontType, diff --git a/test/unit/core_utils_spec.js b/test/unit/core_utils_spec.js index 010929d99..7723e6f66 100644 --- a/test/unit/core_utils_spec.js +++ b/test/unit/core_utils_spec.js @@ -17,6 +17,7 @@ import { Dict, Ref } from "../../src/core/primitives.js"; import { encodeToXmlString, escapePDFName, + escapeString, getInheritableProperty, isAscii, isWhiteSpace, @@ -223,6 +224,14 @@ describe("core_utils", function () { }); }); + describe("escapeString", function () { + it("should escape (, ), \\n, \\r, and \\", function () { + expect(escapeString("((a\\a))\n(b(b\\b)\rb)")).toEqual( + "\\(\\(a\\\\a\\)\\)\\n\\(b\\(b\\\\b\\)\\rb\\)" + ); + }); + }); + describe("encodeToXmlString", function () { it("should get a correctly encoded string with some entities", function () { const str = "\"\u0397ell😂' & "; diff --git a/test/unit/util_spec.js b/test/unit/util_spec.js index ed3726fde..43ee82883 100644 --- a/test/unit/util_spec.js +++ b/test/unit/util_spec.js @@ -17,7 +17,6 @@ import { bytesToString, createPromiseCapability, createValidAbsoluteUrl, - escapeString, getModificationDate, isArrayBuffer, string32, @@ -244,14 +243,6 @@ describe("util", function () { }); }); - describe("escapeString", function () { - it("should escape (, ), \\n, \\r, and \\", function () { - expect(escapeString("((a\\a))\n(b(b\\b)\rb)")).toEqual( - "\\(\\(a\\\\a\\)\\)\\n\\(b\\(b\\\\b\\)\\rb\\)" - ); - }); - }); - describe("getModificationDate", function () { it("should get a correctly formatted date", function () { const date = new Date(Date.UTC(3141, 5, 9, 2, 6, 53)); From 7d029f8bfe803b1c628f0a79e61f1a3651bda880 Mon Sep 17 00:00:00 2001 From: Jonas Jenwald Date: Wed, 16 Nov 2022 12:39:35 +0100 Subject: [PATCH 4/4] Add a basic `stringToUTF16HexString` unit-test --- test/unit/core_utils_spec.js | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/test/unit/core_utils_spec.js b/test/unit/core_utils_spec.js index 7723e6f66..1a8da5a7c 100644 --- a/test/unit/core_utils_spec.js +++ b/test/unit/core_utils_spec.js @@ -23,6 +23,7 @@ import { isWhiteSpace, log2, parseXFAPath, + stringToUTF16HexString, stringToUTF16String, toRomanNumerals, validateCSSFont, @@ -355,6 +356,18 @@ describe("core_utils", function () { }); }); + describe("stringToUTF16HexString", function () { + it("should encode a string in UTF16 hexadecimal format", function () { + expect(stringToUTF16HexString("hello world")).toEqual( + "00680065006c006c006f00200077006f0072006c0064" + ); + + expect(stringToUTF16HexString("こんにちは世界の")).toEqual( + "30533093306b3061306f4e16754c306e" + ); + }); + }); + describe("stringToUTF16String", function () { it("should encode a string in UTF16", function () { expect(stringToUTF16String("hello world")).toEqual(