From 248653684378ce118f152d260c2f7ab46b6aec57 Mon Sep 17 00:00:00 2001 From: Calixte Denizet Date: Thu, 27 Apr 2023 21:50:27 +0200 Subject: [PATCH] Compress the data when saving annotations CompressionStream API has been added in Firefox 113 (see https://bugzilla.mozilla.org/show_bug.cgi?id=1823619) hence we can use it to compress the streams with added/modified annotations. --- src/core/annotation.js | 16 +++---- src/core/document.js | 2 +- src/core/writer.js | 89 +++++++++++++++++++++++++++-------- test/pdfs/.gitignore | 1 + test/pdfs/multiline.pdf | Bin 0 -> 6455 bytes test/test_manifest.json | 14 ++++++ test/unit/annotation_spec.js | 52 ++++++++++++++++++++ test/unit/writer_spec.js | 20 ++++---- 8 files changed, 156 insertions(+), 38 deletions(-) create mode 100755 test/pdfs/multiline.pdf diff --git a/src/core/annotation.js b/src/core/annotation.js index 9521ec53c..e642b3d83 100644 --- a/src/core/annotation.js +++ b/src/core/annotation.js @@ -273,7 +273,7 @@ class AnnotationFactory { baseFont.set("Encoding", Name.get("WinAnsiEncoding")); const buffer = []; baseFontRef = xref.getNewTemporaryRef(); - writeObject(baseFontRef, baseFont, buffer, null); + await writeObject(baseFontRef, baseFont, buffer, null); dependencies.push({ ref: baseFontRef, data: buffer.join("") }); } promises.push( @@ -1479,7 +1479,7 @@ class MarkupAnnotation extends Annotation { const transform = xref.encrypt ? xref.encrypt.createCipherTransform(apRef.num, apRef.gen) : null; - writeObject(apRef, ap, buffer, transform); + await writeObject(apRef, ap, buffer, transform); dependencies.push({ ref: apRef, data: buffer.join("") }); } else { annotationDict = this.createNewDict(annotation, xref, {}); @@ -1489,7 +1489,7 @@ class MarkupAnnotation extends Annotation { const transform = xref.encrypt ? 
xref.encrypt.createCipherTransform(annotationRef.num, annotationRef.gen) : null; - writeObject(annotationRef, annotationDict, buffer, transform); + await writeObject(annotationRef, annotationDict, buffer, transform); return { ref: annotationRef, data: buffer.join("") }; } @@ -1922,7 +1922,7 @@ class WidgetAnnotation extends Annotation { appearanceDict.set("Matrix", rotationMatrix); } - writeObject(newRef, appearanceStream, buffer, newTransform); + await writeObject(newRef, appearanceStream, buffer, newTransform); changes.push( // data for the new AP @@ -1937,7 +1937,7 @@ class WidgetAnnotation extends Annotation { } dict.set("M", `D:${getModificationDate()}`); - writeObject(this.ref, dict, buffer, originalTransform); + await writeObject(this.ref, dict, buffer, originalTransform); changes[0].data = buffer.join(""); @@ -2814,7 +2814,7 @@ class ButtonWidgetAnnotation extends WidgetAnnotation { } const buffer = [`${this.ref.num} ${this.ref.gen} obj\n`]; - writeDict(dict, buffer, originalTransform); + await writeDict(dict, buffer, originalTransform); buffer.push("\nendobj\n"); return [{ ref: this.ref, data: buffer.join(""), xfa }]; @@ -2873,7 +2873,7 @@ class ButtonWidgetAnnotation extends WidgetAnnotation { } parent.set("V", name); parentBuffer = [`${this.parent.num} ${this.parent.gen} obj\n`]; - writeDict(parent, parentBuffer, parentTransform); + await writeDict(parent, parentBuffer, parentTransform); parentBuffer.push("\nendobj\n"); } else if (this.parent instanceof Dict) { this.parent.set("V", name); @@ -2897,7 +2897,7 @@ class ButtonWidgetAnnotation extends WidgetAnnotation { } const buffer = [`${this.ref.num} ${this.ref.gen} obj\n`]; - writeDict(dict, buffer, originalTransform); + await writeDict(dict, buffer, originalTransform); buffer.push("\nendobj\n"); const newRefs = [{ ref: this.ref, data: buffer.join(""), xfa }]; diff --git a/src/core/document.js b/src/core/document.js index f36e77d50..be6f0d728 100644 --- a/src/core/document.js +++ b/src/core/document.js 
@@ -297,7 +297,7 @@ class Page { ); } - writeObject(this.ref, pageDict, buffer, transform); + await writeObject(this.ref, pageDict, buffer, transform); if (savedDict) { pageDict.set("Annots", savedDict); } diff --git a/src/core/writer.js b/src/core/writer.js index 2f3e629c3..6c30acf64 100644 --- a/src/core/writer.js +++ b/src/core/writer.js @@ -13,7 +13,7 @@ * limitations under the License. */ -import { bytesToString, warn } from "../shared/util.js"; +import { bytesToString, info, stringToBytes, warn } from "../shared/util.js"; import { Dict, Name, Ref } from "./primitives.js"; import { escapePDFName, @@ -25,36 +25,87 @@ import { SimpleDOMNode, SimpleXMLParser } from "./xml_parser.js"; import { BaseStream } from "./base_stream.js"; import { calculateMD5 } from "./crypto.js"; -function writeObject(ref, obj, buffer, transform) { +async function writeObject(ref, obj, buffer, transform) { buffer.push(`${ref.num} ${ref.gen} obj\n`); if (obj instanceof Dict) { - writeDict(obj, buffer, transform); + await writeDict(obj, buffer, transform); } else if (obj instanceof BaseStream) { - writeStream(obj, buffer, transform); + await writeStream(obj, buffer, transform); } buffer.push("\nendobj\n"); } -function writeDict(dict, buffer, transform) { +async function writeDict(dict, buffer, transform) { buffer.push("<<"); for (const key of dict.getKeys()) { buffer.push(` /${escapePDFName(key)} `); - writeValue(dict.getRaw(key), buffer, transform); + await writeValue(dict.getRaw(key), buffer, transform); } buffer.push(">>"); } -function writeStream(stream, buffer, transform) { +async function writeStream(stream, buffer, transform) { let string = stream.getString(); if (transform !== null) { string = transform.encryptString(string); } + + // eslint-disable-next-line no-undef + if (typeof CompressionStream === "undefined") { + stream.dict.set("Length", string.length); + await writeDict(stream.dict, buffer, transform); + buffer.push(" stream\n", string, "\nendstream"); + return; + } + + 
const filter = await stream.dict.getAsync("Filter"); + const flateDecode = Name.get("FlateDecode"); + + // If the string is too small there is no real benefit + // in compressing it. + // The number 256 is arbitrary, but it should be reasonable. + const MIN_LENGTH_FOR_COMPRESSING = 256; + + if ( + string.length >= MIN_LENGTH_FOR_COMPRESSING || + (Array.isArray(filter) && filter.includes(flateDecode)) || + (filter instanceof Name && filter.name === flateDecode.name) + ) { + try { + const byteArray = stringToBytes(string); + // eslint-disable-next-line no-undef + const cs = new CompressionStream("deflate"); + const writer = cs.writable.getWriter(); + writer.write(byteArray); + writer.close(); + + // Response::text doesn't return the correct data. + const buf = await new Response(cs.readable).arrayBuffer(); + string = bytesToString(new Uint8Array(buf)); + + if (Array.isArray(filter)) { + if (!filter.includes(flateDecode)) { + filter.push(flateDecode); + } + } else if (!filter) { + stream.dict.set("Filter", flateDecode); + } else if ( + !(filter instanceof Name) || + filter.name !== flateDecode.name + ) { + stream.dict.set("Filter", [filter, flateDecode]); + } + } catch (ex) { + info(`writeStream - cannot compress data: "${ex}".`); + } + } + stream.dict.set("Length", string.length); - writeDict(stream.dict, buffer, transform); + await writeDict(stream.dict, buffer, transform); buffer.push(" stream\n", string, "\nendstream"); } -function writeArray(array, buffer, transform) { +async function writeArray(array, buffer, transform) { buffer.push("["); let first = true; for (const val of array) { @@ -63,18 +114,18 @@ function writeArray(array, buffer, transform) { } else { first = false; } - writeValue(val, buffer, transform); + await writeValue(val, buffer, transform); } buffer.push("]"); } -function writeValue(value, buffer, transform) { +async function writeValue(value, buffer, transform) { if (value instanceof Name) { buffer.push(`/${escapePDFName(value.name)}`); } else 
if (value instanceof Ref) { buffer.push(`${value.num} ${value.gen} R`); } else if (Array.isArray(value)) { - writeArray(value, buffer, transform); + await writeArray(value, buffer, transform); } else if (typeof value === "string") { if (transform !== null) { value = transform.encryptString(value); @@ -85,9 +136,9 @@ function writeValue(value, buffer, transform) { } else if (typeof value === "boolean") { buffer.push(value.toString()); } else if (value instanceof Dict) { - writeDict(value, buffer, transform); + await writeDict(value, buffer, transform); } else if (value instanceof BaseStream) { - writeStream(value, buffer, transform); + await writeStream(value, buffer, transform); } else if (value === null) { buffer.push("null"); } else { @@ -160,7 +211,7 @@ function writeXFADataForAcroform(str, newRefs) { return buffer.join(""); } -function updateAcroform({ +async function updateAcroform({ xref, acroForm, acroFormRef, @@ -206,7 +257,7 @@ function updateAcroform({ } const buffer = []; - writeObject(acroFormRef, dict, buffer, transform); + await writeObject(acroFormRef, dict, buffer, transform); newRefs.push({ ref: acroFormRef, data: buffer.join("") }); } @@ -234,7 +285,7 @@ function updateXFA({ xfaData, xfaDatasetsRef, newRefs, xref }) { newRefs.push({ ref: xfaDatasetsRef, data }); } -function incrementalUpdate({ +async function incrementalUpdate({ originalData, xrefInfo, newRefs, @@ -247,7 +298,7 @@ function incrementalUpdate({ acroForm = null, xfaData = null, }) { - updateAcroform({ + await updateAcroform({ xref, acroForm, acroFormRef, @@ -328,7 +379,7 @@ function incrementalUpdate({ newXref.set("Length", tableLength); buffer.push(`${refForXrefTable.num} ${refForXrefTable.gen} obj\n`); - writeDict(newXref, buffer, null); + await writeDict(newXref, buffer, null); buffer.push(" stream\n"); const bufferLen = buffer.reduce((a, str) => a + str.length, 0); diff --git a/test/pdfs/.gitignore b/test/pdfs/.gitignore index e8becca1e..98d799d1b 100644 --- 
a/test/pdfs/.gitignore +++ b/test/pdfs/.gitignore @@ -590,3 +590,4 @@ !copy_paste_ligatures.pdf !issue16316.pdf !issue14565.pdf +!multiline.pdf diff --git a/test/pdfs/multiline.pdf b/test/pdfs/multiline.pdf new file mode 100755 index 0000000000000000000000000000000000000000..762b4a78ed8a7f174fe929e7817a0e0234118e09 GIT binary patch literal 6455 zcmeHMYgiLk8b$eXxmrN!XlL!P-K#*%irD!I}1R_Z$PKKM8D%N{# zrPS7{XsxzteR|QhsI63C3wDd5sNl7wE^ezuT(P?KvbOG-5R%xS>yQ4}ALb!>&dm9~ z@4Vl8-gD;6$LREUod^{Zg3;|4j-C+kP$`5!v|*+|rGir|c8X*yvndmV!YL4eOQmoc zgv#Iq2*sr`coMjs4G}1=fD@ry2*(u=N)UM(jexS7xEN5c9|4AX$B3tlw24Y58JiP~ zsHLq%HoFtT;dF*FS&S^rKq#isz&eYSr5IReC0VN51x||3RmH{XbU3ENwDA%IM^Hqe zl_%)rG6k+c;;;lcrcr6JxCD7ZLTsD_)8bMsqL9mC<#Hre&bgp*I!zv&WH(Wzxgc>A z_VI>Qm8_x^V2OO9a`XB!cT#UJ86>DT@9nIJv40F+#O0lGg zWJyTkx=br#tw4BANangsC-XrY?e23itp&DLI^QcZl(Xy!Znx8{6T-UfwX~h3?A$ft z3Z`XfM;u+6tB{Jp?8_irN{D3uh7=*h3JC$pFojs5AoAc;pgxkbM8v=odhH%>s==NVh6D^gcl4UZ8Ql(IqGVgi8Qyxfqwo^I(o2 zE?I8jb*oUKfKy49u>hsIA#t#L+Ig@K5TIDvm_@O?rQS~FWmY~ZBDN(gpSt^TP7?X|9LBKn= z{c+{0-&g)CblUgUuAehr+*9&Vz-=FONYO` z&))f9uJInOQ4D_=*H{jczB3)!l^qcGOz`mG^Df>!UK1Xyoww)k=JxPStIOjm_NPTu z2{Wfw?6^8(TICyS*G#kK_iH&H+QO^+OJa0#a0k9L@=)}*Nv)2?q{D|MhbE?^wFX5M zEhiQwJ|C~^V{vYJwd{R`V%MFzJO8>p>UME4x5@1oJT}%pF>W((^YGxrR~=p398fr?VCmBZ%XcqsoV;U|^~{M?g9n}3me*Nv z_g>-(!;9h-2^nuSH0_97lkml~r%Fn?gnj1~X&2F^RZqSew>{*?&xL0*sv}>j?fdKM zue;vvC{>23u5CG-HuOxBdE=$01~jQJNDo4tQEjL0QCCw>RBVgQxb;QN!m}q{k9oJG zZ{&p+I)+c2AG2>+^1wBhjyG>k`>Nr|x@!9-S=WoLz%PY2tNAe znQLVy)5jK$TX=By==m@Gd^T`uVgI_IP1B8+7A(0t^TP}E&86AL9OG>mv8}qLe!!vi zyI*NuKPp&Wv1-G_KK(>C$DMUcB|_m>L*vYszn4tFC22q1xn_FIn>80k=4`4Q`N{eEgVGu0bGE#z zJ9={1*80Y$CmpZ4{^cmmPGRZCvWYuC{_*==FTc{YeB`=eT`lWsmyQyscYGICy)m>u z5xn+=yI11c%Y_$575{a8boRLL8y`fTiu$`oJ$>Ght%<`62i|HMtG;={nEvfgOHN)! 
zZ*6RrpucxE6Pf?$5C#d>CU*_-Lb_vrX(=K><5lZWO0;Qh!)0&-?DcKi^<)|7s)d1q}@t(|F}we_eA= zqWjmkL?XdG8}gLYl1%`GE>q-}61>43qL5$?^&Wc2tr ziWsX4n#l-LR?5bm6o4Pvs~_MrHPz$V>u@$(ubvJcKJN1a9bh;~uTG??fHszRiM-E| zJ3$#pHkPx=p_xLnl`iQ{*fk&BG^qe$Wh!m5n9Jh9iA2LqqX?0LzibRsViKhUk3ujd zf~ep>SbC{;#n6J2EI4)d5(8BtGT#_JmVb!RBkta!_#tth2VJ&*h~pZ8X0mD8s&TUb zfujyMq=->5hM>5lI~!j+pAaUa(hLqjBE4y(mTax#O#Jg8uAmyf+`(v|VFv4|5# zQoN?9$YN3&Wdx#-nMsjJA*V!A1SdrXoHB`Is2Me)n3q9c$CItx*mlkkIiGcJXAbN<1t;2!ji}4VRZ4l zuRNgisg5WCFEsvdR9toTe^m;kJ%s{Lvvt*}x#CxvU$F_pCxZ&eu45%f;HIbyq~+$3 zlyR>Sct2i9L&wtW!S8kSdv@!#!ROBO0k3x#3!iB72lMdqYj^F-0|rI7ss*T;KL|}m zHyUwrYV@~8BQLZ-kFKQpAmE$bZftq!j}eLBb# zQvb?3Ds>k!A6$742ppq3`6@q!zfx8nbX@?|4G(Jnb>XrtF=vJxD31;a(G9J;HDK|f cpmq!k05hDtI&_RbIo|3!ih}=>xa#l!1!#um{{R30 literal 0 HcmV?d00001 diff --git a/test/test_manifest.json b/test/test_manifest.json index e9cfceab0..8951d3d23 100644 --- a/test/test_manifest.json +++ b/test/test_manifest.json @@ -7580,5 +7580,19 @@ "rounds": 1, "annotations": true, "type": "eq" + }, + { + "id": "multiline_compress", + "file": "pdfs/multiline.pdf", + "md5": "4727c7d1e4e5c7d45fded8ab7a2e05e5", + "rounds": 1, + "type": "eq", + "save": true, + "print": true, + "annotationStorage": { + "24R": { + "value": 
"abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz\nabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz\nabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz\nabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz\nabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz\nabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz\nabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz\nabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz\nabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz\nabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz\nabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz\nabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz\nabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz\nabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz\nabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz\nabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz\nabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz\nabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz" + } + } } ] diff --git a/test/unit/annotation_spec.js b/test/unit/annotation_spec.js index b90d18a83..615de6ddd 100644 --- a/test/unit/annotation_spec.js +++ b/test/unit/annotation_spec.js @@ -2115,6 +2115,58 @@ describe("annotation", function () { ); }); + it("should compress and save text", async function () { + const textWidgetRef = Ref.get(123, 0); + const xref = new XRefMock([ + { ref: textWidgetRef, data: textWidgetDict }, + helvRefObj, + ]); + partialEvaluator.xref = xref; + const task = new WorkerTask("test save"); + + const annotation = await AnnotationFactory.create( + xref, + textWidgetRef, + pdfManagerMock, + idFactoryMock + ); + const annotationStorage = new Map(); + const value = "a".repeat(256); + annotationStorage.set(annotation.data.id, { value }); + + const data = await annotation.save( + partialEvaluator, + task, + annotationStorage + ); + expect(data.length).toEqual(2); + const [oldData, newData] = data; + expect(oldData.ref).toEqual(Ref.get(123, 0)); + expect(newData.ref).toEqual(Ref.get(2, 
0)); + + oldData.data = oldData.data.replace(/\(D:\d+\)/, "(date)"); + expect(oldData.data).toEqual( + "123 0 obj\n" + + "<< /Type /Annot /Subtype /Widget /FT /Tx /DA (/Helv 5 Tf) /DR " + + "<< /Font << /Helv 314 0 R>>>> /Rect [0 0 32 10] " + + `/V (${value}) /AP << /N 2 0 R>> /M (date)>>\nendobj\n` + ); + + const compressedData = [ + 120, 156, 211, 15, 169, 80, 112, 242, 117, 86, 40, 84, 112, 10, 81, 208, + 247, 72, 205, 41, 83, 48, 85, 8, 73, 83, 48, 84, 48, 0, 66, 8, 25, 146, + 171, 96, 164, 96, 172, 103, 96, 174, 16, 146, 162, 160, 145, 56, 194, + 129, 166, 66, 72, 150, 130, 107, 136, 66, 160, 130, 171, 175, 51, 0, + 222, 235, 111, 133, + ]; + const compressedStream = String.fromCharCode(...compressedData); + expect(newData.data).toEqual( + "2 0 obj\n<< /Subtype /Form /Resources " + + "<< /Font << /Helv 314 0 R>>>> /BBox [0 0 32 10] /Filter /FlateDecode /Length 68>> stream\n" + + `${compressedStream}\nendstream\nendobj\n` + ); + }); + it("should get field object for usage in JS sandbox", async function () { const textWidgetRef = Ref.get(123, 0); const xDictRef = Ref.get(141, 0); diff --git a/test/unit/writer_spec.js b/test/unit/writer_spec.js index c201c6dfc..6d3101277 100644 --- a/test/unit/writer_spec.js +++ b/test/unit/writer_spec.js @@ -20,7 +20,7 @@ import { StringStream } from "../../src/core/stream.js"; describe("Writer", function () { describe("Incremental update", function () { - it("should update a file with new objects", function () { + it("should update a file with new objects", async function () { const originalData = new Uint8Array(); const newRefs = [ { ref: Ref.get(123, 0x2d), data: "abc\n" }, @@ -37,7 +37,7 @@ describe("Writer", function () { info: {}, }; - let data = incrementalUpdate({ originalData, xrefInfo, newRefs }); + let data = await incrementalUpdate({ originalData, xrefInfo, newRefs }); data = bytesToString(data); const expected = @@ -60,7 +60,7 @@ describe("Writer", function () { expect(data).toEqual(expected); }); - it("should 
update a file, missing the /ID-entry, with new objects", function () { + it("should update a file, missing the /ID-entry, with new objects", async function () { const originalData = new Uint8Array(); const newRefs = [{ ref: Ref.get(123, 0x2d), data: "abc\n" }]; const xrefInfo = { @@ -74,7 +74,7 @@ describe("Writer", function () { info: {}, }; - let data = incrementalUpdate({ originalData, xrefInfo, newRefs }); + let data = await incrementalUpdate({ originalData, xrefInfo, newRefs }); data = bytesToString(data); const expected = @@ -96,7 +96,7 @@ describe("Writer", function () { }); describe("writeDict", function () { - it("should write a Dict", function () { + it("should write a Dict", async function () { const dict = new Dict(null); dict.set("A", Name.get("B")); dict.set("B", Ref.get(123, 456)); @@ -121,7 +121,7 @@ describe("Writer", function () { dict.set("NullVal", null); const buffer = []; - writeDict(dict, buffer, null); + await writeDict(dict, buffer, null); const expected = "<< /A /B /B 123 456 R /C 789 /D (hello world) " + @@ -134,14 +134,14 @@ describe("Writer", function () { expect(buffer.join("")).toEqual(expected); }); - it("should write a Dict in escaping PDF names", function () { + it("should write a Dict in escaping PDF names", async function () { const dict = new Dict(null); dict.set("\xfeA#", Name.get("hello")); dict.set("B", Name.get("#hello")); dict.set("C", Name.get("he\xfello\xff")); const buffer = []; - writeDict(dict, buffer, null); + await writeDict(dict, buffer, null); const expected = "<< /#feA#23 /hello /B /#23hello /C /he#fello#ff>>"; @@ -150,7 +150,7 @@ describe("Writer", function () { }); describe("XFA", function () { - it("should update AcroForm when no datasets in XFA array", function () { + it("should update AcroForm when no datasets in XFA array", async function () { const originalData = new Uint8Array(); const newRefs = []; @@ -176,7 +176,7 @@ describe("Writer", function () { info: {}, }; - let data = incrementalUpdate({ + let 
data = await incrementalUpdate({ originalData, xrefInfo, newRefs,