Merge pull request #14340 from Snuffleupagus/Metadata-fetch-error

Handle errors when fetching the raw /Metadata (issue 14305)
2021-12-04 13:19:37 +01:00 · 2021-12-04 13:19:37 +01:00 · 3117985c55
commit 3117985c55
parent bceed26e67 40291d1943
4 changed files with 3624 additions and 19 deletions
--- a/src/core/catalog.js
+++ b/src/core/catalog.js
@ -50,6 +50,7 @@ import {
  warn,
 } from "../shared/util.js";
 import { NameTree, NumberTree } from "./name_number_tree.js";
+import { BaseStream } from "./base_stream.js";
 import { ColorSpace } from "./colorspace.js";
 import { FileSpec } from "./file_spec.js";
 import { GlobalImageCache } from "./image_utils.js";
@ -153,37 +154,37 @@ class Catalog {

  get metadata() {
    const streamRef = this._catDict.getRaw("Metadata");
-    if (!isRef(streamRef)) {
+    if (!(streamRef instanceof Ref)) {
      return shadow(this, "metadata", null);
    }

-    const suppressEncryption = !(
-      this.xref.encrypt && this.xref.encrypt.encryptMetadata
-    );
-    const stream = this.xref.fetch(streamRef, suppressEncryption);
    let metadata = null;
+    try {
+      const suppressEncryption = !(
+        this.xref.encrypt && this.xref.encrypt.encryptMetadata
+      );
+      const stream = this.xref.fetch(streamRef, suppressEncryption);

-    if (isStream(stream) && isDict(stream.dict)) {
-      const type = stream.dict.get("Type");
-      const subtype = stream.dict.get("Subtype");
+      if (stream instanceof BaseStream && stream.dict instanceof Dict) {
+        const type = stream.dict.get("Type");
+        const subtype = stream.dict.get("Subtype");

-      if (isName(type, "Metadata") && isName(subtype, "XML")) {
-        // XXX: This should examine the charset the XML document defines,
-        // however since there are currently no real means to decode arbitrary
-        // charsets, let's just hope that the author of the PDF was reasonable
-        // enough to stick with the XML default charset, which is UTF-8.
-        try {
+        if (isName(type, "Metadata") && isName(subtype, "XML")) {
+          // XXX: This should examine the charset the XML document defines,
+          // however since there are currently no real means to decode arbitrary
+          // charsets, let's just hope that the author of the PDF was reasonable
+          // enough to stick with the XML default charset, which is UTF-8.
          const data = stringToUTF8String(stream.getString());
          if (data) {
            metadata = new MetadataParser(data).serializable;
          }
-        } catch (e) {
-          if (e instanceof MissingDataException) {
-            throw e;
-          }
-          info("Skipping invalid metadata.");
        }
      }
+    } catch (ex) {
+      if (ex instanceof MissingDataException) {
+        throw ex;
+      }
+      info(`Skipping invalid Metadata: "${ex}".`);
    }
    return shadow(this, "metadata", metadata);
  }
--- a/test/pdfs/.gitignore
+++ b/test/pdfs/.gitignore
@ -498,3 +498,4 @@
 !poppler-91414-0-54.pdf
 !poppler-742-0-fuzzed.pdf
 !poppler-937-0-fuzzed.pdf
+!PDFBOX-3148-2-fuzzed.pdf
--- a/test/pdfs/PDFBOX-3148-2-fuzzed.pdf
+++ b/test/pdfs/PDFBOX-3148-2-fuzzed.pdf
--- a/test/unit/api_spec.js
+++ b/test/unit/api_spec.js
@ -1439,6 +1439,8 @@ describe("api", function () {
      const { info, metadata, contentDispositionFilename, contentLength } =
        await pdfDoc.getMetadata();

+      // Custom, non-standard, information dictionary entries.
+      expect(info.Custom).toEqual(undefined);
      // The following are PDF.js specific, non-standard, properties.
      expect(info.PDFFormatVersion).toEqual(null);
      expect(info.Language).toEqual(null);
@ -1456,6 +1458,33 @@ describe("api", function () {
      await loadingTask.destroy();
    });

+    it("gets metadata, with corrupt /Metadata XRef entry", async function () {
+      const loadingTask = getDocument(
+        buildGetDocumentParams("PDFBOX-3148-2-fuzzed.pdf")
+      );
+      const pdfDoc = await loadingTask.promise;
+      const { info, metadata, contentDispositionFilename, contentLength } =
+        await pdfDoc.getMetadata();
+
+      // Custom, non-standard, information dictionary entries.
+      expect(info.Custom).toEqual(undefined);
+      // The following are PDF.js specific, non-standard, properties.
+      expect(info.PDFFormatVersion).toEqual("1.6");
+      expect(info.Language).toEqual(null);
+      expect(info.EncryptFilterName).toEqual(null);
+      expect(info.IsLinearized).toEqual(false);
+      expect(info.IsAcroFormPresent).toEqual(true);
+      expect(info.IsXFAPresent).toEqual(false);
+      expect(info.IsCollectionPresent).toEqual(false);
+      expect(info.IsSignaturesPresent).toEqual(false);
+
+      expect(metadata).toEqual(null);
+      expect(contentDispositionFilename).toEqual(null);
+      expect(contentLength).toEqual(244351);
+
+      await loadingTask.destroy();
+    });
+
    it("gets markInfo", async function () {
      const loadingTask = getDocument(
        buildGetDocumentParams("annotation-line.pdf")