When updating, write the xref table in the same format as the previous one (bug 1878916)

The specs are unclear about which xref table format must be used. By checking the validity of some PDFs with the Acrobat preflight tool, we can infer that keeping the same format as the previous xref table is the correct thing to do. The PDF in the mentioned bug, after having been changed, wasn't displayed correctly in either Chrome or Acrobat: it's now fixed.
2024-02-08 10:41:51 +01:00 · 2024-02-08 10:41:51 +01:00 · 2133da166e
commit 2133da166e
parent e60329cea1
5 changed files with 204 additions and 77 deletions
--- a/src/core/core_utils.js
+++ b/src/core/core_utils.js
@ -611,6 +611,19 @@ function getRotationMatrix(rotation, width, height) {
  }
 }

+/**
+ * Get the number of bytes to use to represent the given non-negative integer.
+ * If x is zero, the function returns 0 which means that we don't need to waste
+ * a byte to represent it.
+ * @param {number} x - a non-negative integer.
+ * @returns {number} the number of bytes (0 when x is 0).
+ */
+function getSizeInBytes(x) {
+  // n bits are required for numbers up to 2^n - 1.
+  // So for a number x, we need ceil(log2(1 + x)) bits,
+  // and that bit count is then rounded up to a whole number of bytes.
+  return Math.ceil(Math.ceil(Math.log2(1 + x)) / 8);
+}
+
 export {
  arrayBuffersToBytes,
  codePointIter,
@ -622,6 +635,7 @@ export {
  getLookupTableFactory,
  getNewAnnotationsMap,
  getRotationMatrix,
+  getSizeInBytes,
  isAscii,
  isWhiteSpace,
  log2,
--- a/src/core/worker.js
+++ b/src/core/worker.js
@ -35,7 +35,7 @@ import {
  getNewAnnotationsMap,
  XRefParseException,
 } from "./core_utils.js";
-import { Dict, Ref } from "./primitives.js";
+import { Dict, isDict, Ref } from "./primitives.js";
 import { LocalPdfManager, NetworkPdfManager } from "./pdf_manager.js";
 import { AnnotationFactory } from "./annotation.js";
 import { clearGlobalCaches } from "./cleanup_helper.js";
@ -726,6 +726,8 @@ class WorkerMessageHandler {
          acroFormRef,
          acroForm,
          xfaData,
+          // Use the same kind of XRef as the previous one.
+          useXrefStream: isDict(xref.topDict, "XRef"),
        }).finally(() => {
          xref.resetNewTemporaryRef();
        });
--- a/src/core/writer.js
+++ b/src/core/writer.js
@ -18,12 +18,14 @@ import { Dict, isName, Name, Ref } from "./primitives.js";
 import {
  escapePDFName,
  escapeString,
+  getSizeInBytes,
  numberToString,
  parseXFAPath,
 } from "./core_utils.js";
 import { SimpleDOMNode, SimpleXMLParser } from "./xml_parser.js";
 import { BaseStream } from "./base_stream.js";
 import { calculateMD5 } from "./crypto.js";
+import { Stream } from "./stream.js";

 async function writeObject(ref, obj, buffer, { encrypt = null }) {
  const transform = encrypt?.createCipherTransform(ref.num, ref.gen);
@ -281,6 +283,112 @@ function updateXFA({ xfaData, xfaDatasetsRef, newRefs, xref }) {
  newRefs.push({ ref: xfaDatasetsRef, data });
 }

+/**
+ * Append a classic (non-stream) cross-reference section, its trailer
+ * dictionary and the `startxref` footer to `buffer`.
+ *
+ * @param {Object} xrefInfo - forwarded to `computeIDs` to fill the /ID entry.
+ * @param {number} baseOffset - offset recorded for the first entry of
+ *   `newRefs`; each following entry is shifted by the previous `data.length`.
+ * @param {Array<Object>} newRefs - `{ ref, data }` entries. Assumed to be
+ *   already sorted by `ref.num`, since the subsection headers are emitted in
+ *   iteration order - TODO confirm against every caller.
+ * @param {Dict} newXref - the trailer dictionary; an /ID entry may be added.
+ * @param {Array<string>} buffer - output chunks, appended to in place.
+ * @returns {Promise<undefined>}
+ */
+async function getXRefTable(xrefInfo, baseOffset, newRefs, newXref, buffer) {
+  buffer.push("xref\n");
+  const indexes = getIndexes(newRefs);
+  let indexesPosition = 0;
+  for (const { ref, data } of newRefs) {
+    // `indexes` is a flat list of (first object number, count) pairs: write
+    // a subsection header when reaching the first object of the next pair.
+    if (ref.num === indexes[indexesPosition]) {
+      buffer.push(
+        `${indexes[indexesPosition]} ${indexes[indexesPosition + 1]}\n`
+      );
+      indexesPosition += 2;
+    }
+    // The EOL is \r\n to make sure that every entry is exactly 20 bytes long.
+    // (see 7.5.4 - Cross-Reference Table).
+    // The generation number is clamped to 0xffff so it fits in 5 digits.
+    buffer.push(
+      `${baseOffset.toString().padStart(10, "0")} ${Math.min(ref.gen, 0xffff).toString().padStart(5, "0")} n\r\n`
+    );
+    baseOffset += data.length;
+  }
+  // From here on, baseOffset is the initial offset plus the length of every
+  // `data`; it is used both for the /ID computation and as `startxref` value.
+  computeIDs(baseOffset, xrefInfo, newXref);
+  buffer.push("trailer\n");
+  await writeDict(newXref, buffer);
+  buffer.push("\nstartxref\n", baseOffset.toString(), "\n%%EOF\n");
+}
+
+/**
+ * Build the flat list of (first object number, count) pairs describing the
+ * object numbers found in `newRefs`, merging runs of consecutive numbers
+ * into a single pair.
+ *
+ * @param {Array<Object>} newRefs - entries with a `ref.num` property; runs are
+ *   only merged when consecutive numbers are adjacent in the array.
+ * @returns {Array<number>} e.g. `[123, 1, 456, 2]` for object numbers
+ *   123, 456 and 457.
+ */
+function getIndexes(newRefs) {
+  const indexes = [];
+  for (const { ref } of newRefs) {
+    // first + count is the number which would extend the current run.
+    // When `indexes` is still empty, the sum is NaN and the comparison is
+    // false, hence the very first ref always starts a new pair.
+    if (ref.num === indexes.at(-2) + indexes.at(-1)) {
+      indexes[indexes.length - 1] += 1;
+    } else {
+      indexes.push(ref.num, 1);
+    }
+  }
+  return indexes;
+}
+
+/**
+ * Append a cross-reference stream object and the `startxref` footer to
+ * `buffer`.
+ *
+ * @param {Object} xrefInfo - provides `newRef`, the reference used for the
+ *   xref stream object, and is forwarded to `computeIDs`.
+ * @param {number} baseOffset - offset recorded for the first entry of
+ *   `newRefs`; each following entry is shifted by the previous `data.length`.
+ * @param {Array<Object>} newRefs - `{ ref, data }` entries. Presumably
+ *   includes the entry for the xref stream itself with empty data (as added
+ *   by `getTrailerDict`) - verify against the caller.
+ * @param {Dict} newXref - dictionary of the xref stream; /Index, /W and
+ *   possibly /ID are set on it here.
+ * @param {Array<string>} buffer - output chunks, appended to in place.
+ * @returns {Promise<undefined>}
+ */
+async function getXRefStreamTable(
+  xrefInfo,
+  baseOffset,
+  newRefs,
+  newXref,
+  buffer
+) {
+  const xrefTableData = [];
+  let maxOffset = 0;
+  let maxGen = 0;
+  for (const { ref, data } of newRefs) {
+    maxOffset = Math.max(maxOffset, baseOffset);
+    const gen = Math.min(ref.gen, 0xffff);
+    maxGen = Math.max(maxGen, gen);
+    // Every entry is written with 1 as its first field (the entry type).
+    xrefTableData.push([1, baseOffset, gen]);
+    baseOffset += data.length;
+  }
+  newXref.set("Index", getIndexes(newRefs));
+  // Use the smallest field widths able to hold the largest offset and the
+  // largest generation number; the latter is 0 when every generation is 0.
+  const offsetSize = getSizeInBytes(maxOffset);
+  const maxGenSize = getSizeInBytes(maxGen);
+  const sizes = [1, offsetSize, maxGenSize];
+  newXref.set("W", sizes);
+  computeIDs(baseOffset, xrefInfo, newXref);
+
+  // One fixed-size record per entry.
+  const structSize = sizes.reduce((a, x) => a + x, 0);
+  const data = new Uint8Array(structSize * xrefTableData.length);
+  const stream = new Stream(data);
+  stream.dict = newXref;
+
+  let offset = 0;
+  for (const [type, objOffset, gen] of xrefTableData) {
+    offset = writeInt(type, sizes[0], offset, data);
+    offset = writeInt(objOffset, sizes[1], offset, data);
+    offset = writeInt(gen, sizes[2], offset, data);
+  }
+
+  await writeObject(xrefInfo.newRef, stream, buffer, {});
+  // baseOffset is now the initial offset plus the length of every `data`.
+  buffer.push("startxref\n", baseOffset.toString(), "\n%%EOF\n");
+}
+
+/**
+ * Set the /ID entry of `newXref` when the original file has file identifiers:
+ * the first identifier is kept and the second one is the value returned by
+ * `computeMD5`. Nothing is set when `xrefInfo.fileIds` is not a non-empty
+ * array.
+ *
+ * @param {number} baseOffset - forwarded to `computeMD5`.
+ * @param {Object} xrefInfo - provides `fileIds`; also forwarded to
+ *   `computeMD5`.
+ * @param {Dict} newXref - dictionary which receives the /ID entry.
+ */
+function computeIDs(baseOffset, xrefInfo, newXref) {
+  if (Array.isArray(xrefInfo.fileIds) && xrefInfo.fileIds.length > 0) {
+    const md5 = computeMD5(baseOffset, xrefInfo);
+    newXref.set("ID", [xrefInfo.fileIds[0], md5]);
+  }
+}
+
+/**
+ * Create the dictionary used either as trailer (classic xref table) or as
+ * dictionary of the xref stream.
+ *
+ * Side effect: when `useXrefStream` is true, an entry with empty data for the
+ * xref stream object itself is pushed into `newRefs`.
+ *
+ * @param {Object} xrefInfo - provides `startXRef`, `newRef`, `rootRef`,
+ *   `infoRef` and `encryptRef`.
+ * @param {Array<Object>} newRefs - `{ ref, data }` entries; possibly modified
+ *   in place (see above).
+ * @param {boolean} useXrefStream - true to prepare a xref stream dictionary,
+ *   false for a classic trailer.
+ * @returns {Dict}
+ */
+function getTrailerDict(xrefInfo, newRefs, useXrefStream) {
+  const newXref = new Dict(null);
+  newXref.set("Prev", xrefInfo.startXRef);
+  const refForXrefTable = xrefInfo.newRef;
+  if (useXrefStream) {
+    // The xref stream is an object on its own, using `newRef`: it needs an
+    // entry in the table and it has to be counted in /Size.
+    newRefs.push({ ref: refForXrefTable, data: "" });
+    newXref.set("Size", refForXrefTable.num + 1);
+    newXref.set("Type", Name.get("XRef"));
+  } else {
+    // No object is created for a classic table, hence `newRef` isn't counted
+    // (presumably `newRef.num` is the next free object number - TODO confirm).
+    newXref.set("Size", refForXrefTable.num);
+  }
+  // /Root, /Info and /Encrypt are only written when the reference isn't null.
+  if (xrefInfo.rootRef !== null) {
+    newXref.set("Root", xrefInfo.rootRef);
+  }
+  if (xrefInfo.infoRef !== null) {
+    newXref.set("Info", xrefInfo.infoRef);
+  }
+  if (xrefInfo.encryptRef !== null) {
+    newXref.set("Encrypt", xrefInfo.encryptRef);
+  }
+  return newXref;
+}
+
 async function incrementalUpdate({
  originalData,
  xrefInfo,
@ -293,6 +401,7 @@ async function incrementalUpdate({
  acroFormRef = null,
  acroForm = null,
  xfaData = null,
+  useXrefStream = false,
 }) {
  await updateAcroform({
    xref,
@ -314,9 +423,6 @@ async function incrementalUpdate({
    });
  }

-  const newXref = new Dict(null);
-  const refForXrefTable = xrefInfo.newRef;
-
  let buffer, baseOffset;
  const lastByte = originalData.at(-1);
  if (lastByte === /* \n */ 0x0a || lastByte === /* \r */ 0x0d) {
@ -328,60 +434,23 @@ async function incrementalUpdate({
    baseOffset = originalData.length + 1;
  }

-  newXref.set("Size", refForXrefTable.num + 1);
-  newXref.set("Prev", xrefInfo.startXRef);
-  newXref.set("Type", Name.get("XRef"));
-
-  if (xrefInfo.rootRef !== null) {
-    newXref.set("Root", xrefInfo.rootRef);
-  }
-  if (xrefInfo.infoRef !== null) {
-    newXref.set("Info", xrefInfo.infoRef);
-  }
-  if (xrefInfo.encryptRef !== null) {
-    newXref.set("Encrypt", xrefInfo.encryptRef);
-  }
-
-  // Add a ref for the new xref and sort them
-  newRefs.push({ ref: refForXrefTable, data: "" });
+  const newXref = getTrailerDict(xrefInfo, newRefs, useXrefStream);
  newRefs = newRefs.sort(
    (a, b) => /* compare the refs */ a.ref.num - b.ref.num
  );
-
-  const xrefTableData = [[0, 1, 0xffff]];
-  const indexes = [0, 1];
-  let maxOffset = 0;
-  for (const { ref, data } of newRefs) {
-    maxOffset = Math.max(maxOffset, baseOffset);
-    xrefTableData.push([1, baseOffset, Math.min(ref.gen, 0xffff)]);
-    baseOffset += data.length;
-    indexes.push(ref.num, 1);
+  for (const { data } of newRefs) {
    buffer.push(data);
  }

-  newXref.set("Index", indexes);
+  await (useXrefStream
+    ? getXRefStreamTable(xrefInfo, baseOffset, newRefs, newXref, buffer)
+    : getXRefTable(xrefInfo, baseOffset, newRefs, newXref, buffer));

-  if (Array.isArray(xrefInfo.fileIds) && xrefInfo.fileIds.length > 0) {
-    const md5 = computeMD5(baseOffset, xrefInfo);
-    newXref.set("ID", [xrefInfo.fileIds[0], md5]);
-  }
-
-  const offsetSize = Math.ceil(Math.log2(maxOffset) / 8);
-  const sizes = [1, offsetSize, 2];
-  const structSize = sizes[0] + sizes[1] + sizes[2];
-  const tableLength = structSize * xrefTableData.length;
-  newXref.set("W", sizes);
-  newXref.set("Length", tableLength);
-
-  buffer.push(`${refForXrefTable.num} ${refForXrefTable.gen} obj\n`);
-  await writeDict(newXref, buffer, null);
-  buffer.push(" stream\n");
-
-  const bufferLen = buffer.reduce((a, str) => a + str.length, 0);
-  const footer = `\nendstream\nendobj\nstartxref\n${baseOffset}\n%%EOF\n`;
-  const array = new Uint8Array(
-    originalData.length + bufferLen + tableLength + footer.length
+  const totalLength = buffer.reduce(
+    (a, str) => a + str.length,
+    originalData.length
  );
+  const array = new Uint8Array(totalLength);

  // Original data
  array.set(originalData);
@ -393,16 +462,6 @@ async function incrementalUpdate({
    offset += str.length;
  }

-  // New xref table
-  for (const [type, objOffset, gen] of xrefTableData) {
-    offset = writeInt(type, sizes[0], offset, array);
-    offset = writeInt(objOffset, sizes[1], offset, array);
-    offset = writeInt(gen, sizes[2], offset, array);
-  }
-
-  // Add the footer
-  writeString(footer, offset, array);
-
  return array;
 }

--- a/test/unit/core_utils_spec.js
+++ b/test/unit/core_utils_spec.js
@ -19,6 +19,7 @@ import {
  escapePDFName,
  escapeString,
  getInheritableProperty,
+  getSizeInBytes,
  isAscii,
  isWhiteSpace,
  log2,
@ -468,4 +469,21 @@ describe("core_utils", function () {
      );
    });
  });
+
+  describe("getSizeInBytes", function () {
+    it("should get the size in bytes to use to represent a positive integer", function () {
+      expect(getSizeInBytes(0)).toEqual(0);
+      for (let i = 1; i <= 0xff; i++) {
+        expect(getSizeInBytes(i)).toEqual(1);
+      }
+
+      for (let i = 0x100; i <= 0xffff; i += 0x100) {
+        expect(getSizeInBytes(i)).toEqual(2);
+      }
+
+      for (let i = 0x10000; i <= 0xffffff; i += 0x10000) {
+        expect(getSizeInBytes(i)).toEqual(3);
+      }
+    });
+  });
 });
--- a/test/unit/writer_spec.js
+++ b/test/unit/writer_spec.js
@ -37,26 +37,55 @@ describe("Writer", function () {
        info: {},
      };

-      let data = await incrementalUpdate({ originalData, xrefInfo, newRefs });
+      let data = await incrementalUpdate({
+        originalData,
+        xrefInfo,
+        newRefs,
+        useXrefStream: true,
+      });
      data = bytesToString(data);

-      const expected =
+      let expected =
        "\nabc\n" +
        "defg\n" +
        "789 0 obj\n" +
-        "<< /Size 790 /Prev 314 /Type /XRef /Index [0 1 123 1 456 1 789 1] " +
-        "/ID [(id) (\x01#Eg\x89\xab\xcd\xef\xfe\xdc\xba\x98vT2\x10)] " +
-        "/W [1 1 2] /Length 16>> stream\n" +
-        "\x00\x01\xff\xff" +
-        "\x01\x01\x00\x2d" +
-        "\x01\x05\x00\x4e" +
-        "\x01\x0a\x00\x00\n" +
+        "<< /Prev 314 /Size 790 /Type /XRef /Index [123 1 456 1 789 1] " +
+        "/W [1 1 1] /ID [(id) (\x01#Eg\x89\xab\xcd\xef\xfe\xdc\xba\x98vT2\x10)] " +
+        "/Length 9>> stream\n" +
+        "\x01\x01\x2d" +
+        "\x01\x05\x4e" +
+        "\x01\x0a\x00\n" +
        "endstream\n" +
        "endobj\n" +
        "startxref\n" +
        "10\n" +
        "%%EOF\n";
+      expect(data).toEqual(expected);

+      data = await incrementalUpdate({
+        originalData,
+        xrefInfo,
+        newRefs,
+        useXrefStream: false,
+      });
+      data = bytesToString(data);
+
+      expected =
+        "\nabc\n" +
+        "defg\n" +
+        "xref\n" +
+        "123 1\n" +
+        "0000000001 00045 n\r\n" +
+        "456 1\n" +
+        "0000000005 00078 n\r\n" +
+        "789 1\n" +
+        "0000000010 00000 n\r\n" +
+        "trailer\n" +
+        "<< /Prev 314 /Size 789 " +
+        "/ID [(id) (\x01#Eg\x89\xab\xcd\xef\xfe\xdc\xba\x98vT2\x10)]>>\n" +
+        "startxref\n" +
+        "10\n" +
+        "%%EOF\n";
      expect(data).toEqual(expected);
    });

@ -74,17 +103,21 @@ describe("Writer", function () {
        info: {},
      };

-      let data = await incrementalUpdate({ originalData, xrefInfo, newRefs });
+      let data = await incrementalUpdate({
+        originalData,
+        xrefInfo,
+        newRefs,
+        useXrefStream: true,
+      });
      data = bytesToString(data);

      const expected =
        "\nabc\n" +
        "789 0 obj\n" +
-        "<< /Size 790 /Prev 314 /Type /XRef /Index [0 1 123 1 789 1] " +
-        "/W [1 1 2] /Length 12>> stream\n" +
-        "\x00\x01\xff\xff" +
-        "\x01\x01\x00\x2d" +
-        "\x01\x05\x00\x00\n" +
+        "<< /Prev 314 /Size 790 /Type /XRef /Index [123 1 789 1] " +
+        "/W [1 1 1] /Length 6>> stream\n" +
+        "\x01\x01\x2d" +
+        "\x01\x05\x00\n" +
        "endstream\n" +
        "endobj\n" +
        "startxref\n" +
@ -187,6 +220,7 @@ describe("Writer", function () {
        acroForm,
        xfaData,
        xref: {},
+        useXrefStream: true,
      });
      data = bytesToString(data);

@ -202,8 +236,8 @@ describe("Writer", function () {
        "endstream\n" +
        "endobj\n" +
        "131415 0 obj\n" +
-        "<< /Size 131416 /Prev 314 /Type /XRef /Index [0 1 789 1 101112 1 131415 1] /W [1 1 2] /Length 16>> stream\n" +
-        "\u0000\u0001ÿÿ\u0001\u0001\u0000\u0000\u0001[\u0000\u0000\u0001¹\u0000\u0000\n" +
+        "<< /Prev 314 /Size 131416 /Type /XRef /Index [789 1 101112 1 131415 1] /W [1 1 0] /Length 6>> stream\n" +
+        "\x01\x01\x01[\x01¹\n" +
        "endstream\n" +
        "endobj\n" +
        "startxref\n" +