Move the Metadata parsing to the worker-thread

The only reason, as far as I can tell, that the Metadata is parsed on the main-thread is historical: when Metadata support was first implemented, it relied on the [`DOMParser`](https://developer.mozilla.org/en-US/docs/Web/API/DOMParser), which isn't available in workers. Today, with the custom XML-parser being used, that's no longer an issue, and it seems reasonable to move the Metadata parsing to the worker-thread[1], since that's where all parsing should happen (for performance reasons). Based on these changes, we'll be able to reduce the now unnecessary duplication of the XML-parser (and related code) in both of the *built* `pdf.js`/`pdf.worker.js` files. Finally, this patch changes the `_repair` method to use "Array + join" rather than string concatenation. --- [1] This depends on the previous patch, which enables sending of `Map`s between threads when workers are disabled.
2021-02-16 14:13:39 +01:00 · 2021-02-16 14:13:39 +01:00 · cc3a6563ee
commit cc3a6563ee
parent 73bf45e64b
6 changed files with 177 additions and 142 deletions
--- a/src/core/metadata_parser.js
+++ b/src/core/metadata_parser.js
@ -0,0 +1,146 @@
+/* Copyright 2012 Mozilla Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import { SimpleXMLParser } from "../shared/xml_parser.js";
+
+class MetadataParser {
+  constructor(data) {
+    // Ghostscript may produce invalid metadata, so try to repair that first.
+    data = this._repair(data);
+
+    // Convert the string to an XML document.
+    const parser = new SimpleXMLParser({ lowerCaseName: true });
+    const xmlDocument = parser.parseFromString(data);
+
+    this._metadataMap = new Map();
+    this._data = data;
+
+    if (xmlDocument) {
+      this._parse(xmlDocument);
+    }
+  }
+
+  _repair(data) {
+    // Start by removing any "junk" before the first tag (see issue 10395).
+    return data
+      .replace(/^[^<]+/, "")
+      .replace(/>\\376\\377([^<]+)/g, function (all, codes) {
+        const bytes = codes
+          .replace(/\\([0-3])([0-7])([0-7])/g, function (code, d1, d2, d3) {
+            return String.fromCharCode(d1 * 64 + d2 * 8 + d3 * 1);
+          })
+          .replace(/&(amp|apos|gt|lt|quot);/g, function (str, name) {
+            switch (name) {
+              case "amp":
+                return "&";
+              case "apos":
+                return "'";
+              case "gt":
+                return ">";
+              case "lt":
+                return "<";
+              case "quot":
+                return '"';
+            }
+            throw new Error(`_repair: ${name} isn't defined.`);
+          });
+
+        const charBuf = [];
+        for (let i = 0, ii = bytes.length; i < ii; i += 2) {
+          const code = bytes.charCodeAt(i) * 256 + bytes.charCodeAt(i + 1);
+          if (
+            code >= /* Space = */ 32 &&
+            code < /* Delete = */ 127 &&
+            code !== /* '<' = */ 60 &&
+            code !== /* '>' = */ 62 &&
+            code !== /* '&' = */ 38
+          ) {
+            charBuf.push(String.fromCharCode(code));
+          } else {
+            charBuf.push(
+              "&#x" + (0x10000 + code).toString(16).substring(1) + ";"
+            );
+          }
+        }
+        return ">" + charBuf.join("");
+      });
+  }
+
+  _getSequence(entry) {
+    const name = entry.nodeName;
+    if (name !== "rdf:bag" && name !== "rdf:seq" && name !== "rdf:alt") {
+      return null;
+    }
+    return entry.childNodes.filter(node => node.nodeName === "rdf:li");
+  }
+
+  _parseArray(entry) {
+    if (!entry.hasChildNodes()) {
+      return;
+    }
+    // The first child must be an rdf:Bag, rdf:Seq or rdf:Alt container.
+    const [seqNode] = entry.childNodes;
+    const sequence = this._getSequence(seqNode) || [];
+
+    this._metadataMap.set(
+      entry.nodeName,
+      sequence.map(node => node.textContent.trim())
+    );
+  }
+
+  _parse(xmlDocument) {
+    let rdf = xmlDocument.documentElement;
+
+    if (rdf.nodeName !== "rdf:rdf") {
+      // Wrapped in <xmpmeta>
+      rdf = rdf.firstChild;
+      while (rdf && rdf.nodeName !== "rdf:rdf") {
+        rdf = rdf.nextSibling;
+      }
+    }
+
+    if (!rdf || rdf.nodeName !== "rdf:rdf" || !rdf.hasChildNodes()) {
+      return;
+    }
+
+    for (const desc of rdf.childNodes) {
+      if (desc.nodeName !== "rdf:description") {
+        continue;
+      }
+
+      for (const entry of desc.childNodes) {
+        const name = entry.nodeName;
+        switch (name) {
+          case "#text":
+            continue;
+          case "dc:creator":
+          case "dc:subject":
+            this._parseArray(entry);
+            continue;
+        }
+        this._metadataMap.set(name, entry.textContent.trim());
+      }
+    }
+  }
+
+  get serializable() {
+    return {
+      parsedData: this._metadataMap,
+      rawData: this._data,
+    };
+  }
+}
+
+export { MetadataParser };
--- a/src/core/obj.js
+++ b/src/core/obj.js
@ -59,6 +59,7 @@ import { Lexer, Parser } from "./parser.js";
 import { CipherTransformFactory } from "./crypto.js";
 import { ColorSpace } from "./colorspace.js";
 import { GlobalImageCache } from "./image_utils.js";
+import { MetadataParser } from "./metadata_parser.js";

 function fetchDestination(dest) {
  return isDict(dest) ? dest.get("D") : dest;
@ -131,20 +132,22 @@ class Catalog {
      this.xref.encrypt && this.xref.encrypt.encryptMetadata
    );
    const stream = this.xref.fetch(streamRef, suppressEncryption);
-    let metadata;
+    let metadata = null;

-    if (stream && isDict(stream.dict)) {
+    if (isStream(stream) && isDict(stream.dict)) {
      const type = stream.dict.get("Type");
      const subtype = stream.dict.get("Subtype");

      if (isName(type, "Metadata") && isName(subtype, "XML")) {
        // XXX: This should examine the charset the XML document defines,
-        // however since there are currently no real means to decode
-        // arbitrary charsets, let's just hope that the author of the PDF
-        // was reasonable enough to stick with the XML default charset,
-        // which is UTF-8.
+        // however since there are currently no real means to decode arbitrary
+        // charsets, let's just hope that the author of the PDF was reasonable
+        // enough to stick with the XML default charset, which is UTF-8.
        try {
-          metadata = stringToUTF8String(bytesToString(stream.getBytes()));
+          const data = stringToUTF8String(bytesToString(stream.getBytes()));
+          if (data) {
+            metadata = new MetadataParser(data).serializable;
+          }
        } catch (e) {
          if (e instanceof MissingDataException) {
            throw e;
--- a/src/display/metadata.js
+++ b/src/display/metadata.js
@ -13,129 +13,12 @@
 * limitations under the License.
 */

-import { assert, objectFromEntries } from "../shared/util.js";
-import { SimpleXMLParser } from "../shared/xml_parser.js";
+import { objectFromEntries } from "../shared/util.js";

 class Metadata {
-  constructor(data) {
-    assert(typeof data === "string", "Metadata: input is not a string");
-
-    // Ghostscript may produce invalid metadata, so try to repair that first.
-    data = this._repair(data);
-
-    // Convert the string to an XML document.
-    const parser = new SimpleXMLParser({ lowerCaseName: true });
-    const xmlDocument = parser.parseFromString(data);
-
-    this._metadataMap = new Map();
-
-    if (xmlDocument) {
-      this._parse(xmlDocument);
-    }
-    this._data = data;
-  }
-
-  _repair(data) {
-    // Start by removing any "junk" before the first tag (see issue 10395).
-    return data
-      .replace(/^[^<]+/, "")
-      .replace(/>\\376\\377([^<]+)/g, function (all, codes) {
-        const bytes = codes
-          .replace(/\\([0-3])([0-7])([0-7])/g, function (code, d1, d2, d3) {
-            return String.fromCharCode(d1 * 64 + d2 * 8 + d3 * 1);
-          })
-          .replace(/&(amp|apos|gt|lt|quot);/g, function (str, name) {
-            switch (name) {
-              case "amp":
-                return "&";
-              case "apos":
-                return "'";
-              case "gt":
-                return ">";
-              case "lt":
-                return "<";
-              case "quot":
-                return '"';
-            }
-            throw new Error(`_repair: ${name} isn't defined.`);
-          });
-
-        let chars = "";
-        for (let i = 0, ii = bytes.length; i < ii; i += 2) {
-          const code = bytes.charCodeAt(i) * 256 + bytes.charCodeAt(i + 1);
-          if (
-            code >= /* Space = */ 32 &&
-            code < /* Delete = */ 127 &&
-            code !== /* '<' = */ 60 &&
-            code !== /* '>' = */ 62 &&
-            code !== /* '&' = */ 38
-          ) {
-            chars += String.fromCharCode(code);
-          } else {
-            chars += "&#x" + (0x10000 + code).toString(16).substring(1) + ";";
-          }
-        }
-
-        return ">" + chars;
-      });
-  }
-
-  _getSequence(entry) {
-    const name = entry.nodeName;
-    if (name !== "rdf:bag" && name !== "rdf:seq" && name !== "rdf:alt") {
-      return null;
-    }
-
-    return entry.childNodes.filter(node => node.nodeName === "rdf:li");
-  }
-
-  _parseArray(entry) {
-    if (!entry.hasChildNodes()) {
-      return;
-    }
-    // Child must be a Bag (unordered array) or a Seq.
-    const [seqNode] = entry.childNodes;
-    const sequence = this._getSequence(seqNode) || [];
-
-    this._metadataMap.set(
-      entry.nodeName,
-      sequence.map(node => node.textContent.trim())
-    );
-  }
-
-  _parse(xmlDocument) {
-    let rdf = xmlDocument.documentElement;
-
-    if (rdf.nodeName !== "rdf:rdf") {
-      // Wrapped in <xmpmeta>
-      rdf = rdf.firstChild;
-      while (rdf && rdf.nodeName !== "rdf:rdf") {
-        rdf = rdf.nextSibling;
-      }
-    }
-
-    if (!rdf || rdf.nodeName !== "rdf:rdf" || !rdf.hasChildNodes()) {
-      return;
-    }
-
-    for (const desc of rdf.childNodes) {
-      if (desc.nodeName !== "rdf:description") {
-        continue;
-      }
-
-      for (const entry of desc.childNodes) {
-        const name = entry.nodeName;
-        switch (name) {
-          case "#text":
-            continue;
-          case "dc:creator":
-          case "dc:subject":
-            this._parseArray(entry);
-            continue;
-        }
-        this._metadataMap.set(name, entry.textContent.trim());
-      }
-    }
+  constructor({ parsedData, rawData }) {
+    this._metadataMap = parsedData;
+    this._data = rawData;
  }

  getRaw() {
--- a/src/shared/xml_parser.js
+++ b/src/shared/xml_parser.js
@ -16,7 +16,7 @@
 // The code for XMLParserBase copied from
 // https://github.com/mozilla/shumway/blob/16451d8836fa85f4b16eeda8b4bda2fa9e2b22b0/src/avm2/natives/xml.ts

-import { encodeToXmlString } from "./util.js";
+import { encodeToXmlString } from "../shared/util.js";

 const XMLParserErrorCode = {
  NoError: 0,
--- a/test/unit/metadata_spec.js
+++ b/test/unit/metadata_spec.js
@ -15,6 +15,12 @@

 import { isEmptyObj } from "./test_utils.js";
 import { Metadata } from "../../src/display/metadata.js";
+import { MetadataParser } from "../../src/core/metadata_parser.js";
+
+function createMetadata(data) {
+  const metadataParser = new MetadataParser(data);
+  return new Metadata(metadataParser.serializable);
+}

 describe("metadata", function () {
  it("should handle valid metadata", function () {
@ -24,7 +30,7 @@ describe("metadata", function () {
      "<rdf:Description xmlns:dc='http://purl.org/dc/elements/1.1/'>" +
      '<dc:title><rdf:Alt><rdf:li xml:lang="x-default">Foo bar baz</rdf:li>' +
      "</rdf:Alt></dc:title></rdf:Description></rdf:RDF></x:xmpmeta>";
-    const metadata = new Metadata(data);
+    const metadata = createMetadata(data);

    expect(metadata.has("dc:title")).toBeTruthy();
    expect(metadata.has("dc:qux")).toBeFalsy();
@ -42,7 +48,7 @@ describe("metadata", function () {
      "<rdf:Description xmlns:dc='http://purl.org/dc/elements/1.1/'>" +
      "<dc:title>\\376\\377\\000P\\000D\\000F\\000&</dc:title>" +
      "</rdf:Description></rdf:RDF></x:xmpmeta>";
-    const metadata = new Metadata(data);
+    const metadata = createMetadata(data);

    expect(metadata.has("dc:title")).toBeTruthy();
    expect(metadata.has("dc:qux")).toBeFalsy();
@ -85,7 +91,7 @@ describe("metadata", function () {
      "<dc:creator><rdf:Seq><rdf:li>\\376\\377\\000O\\000D\\000I\\000S" +
      "</rdf:li></rdf:Seq></dc:creator></rdf:Description></rdf:RDF>" +
      "</x:xmpmeta>";
-    const metadata = new Metadata(data);
+    const metadata = createMetadata(data);

    expect(metadata.has("dc:title")).toBeTruthy();
    expect(metadata.has("dc:qux")).toBeFalsy();
@ -128,7 +134,7 @@ describe("metadata", function () {
      "</rdf:RDF>" +
      "</x:xmpmeta>" +
      '<?xpacket end="w"?>';
-    const metadata = new Metadata(data);
+    const metadata = createMetadata(data);

    expect(isEmptyObj(metadata.getAll())).toEqual(true);
  });
@ -159,7 +165,7 @@ describe("metadata", function () {
      '<dc:title><rdf:Alt><rdf:li xml:lang="x-default"></rdf:li>' +
      "</rdf:Alt></dc:title><dc:format>application/pdf</dc:format>" +
      '</rdf:Description></rdf:RDF></x:xmpmeta><?xpacket end="w"?>';
-    const metadata = new Metadata(data);
+    const metadata = createMetadata(data);

    expect(metadata.has("dc:title")).toBeTruthy();
    expect(metadata.has("dc:qux")).toBeFalsy();
@ -191,7 +197,7 @@ describe("metadata", function () {
      "<dc:title><rdf:Alt>" +
      '<rdf:li xml:lang="x-default">&apos;Foo bar baz&apos;</rdf:li>' +
      "</rdf:Alt></dc:title></rdf:Description></rdf:RDF></x:xmpmeta>";
-    const metadata = new Metadata(data);
+    const metadata = createMetadata(data);

    expect(metadata.has("dc:title")).toBeTruthy();
    expect(metadata.has("dc:qux")).toBeFalsy();
@ -220,7 +226,7 @@ describe("metadata", function () {
      "<xmpMM:DocumentID>uuid:00000000-1c84-3cf9-89ba-bef0e729c831" +
      "</xmpMM:DocumentID></rdf:Description>" +
      '</rdf:RDF></x:xmpmeta><?xpacket end="w"?>';
-    const metadata = new Metadata(data);
+    const metadata = createMetadata(data);

    expect(isEmptyObj(metadata.getAll())).toEqual(true);
  });
@ -249,7 +255,7 @@ describe("metadata", function () {
      "    </dc:title>" +
      "  </rdf:Description>" +
      "</rdf:RDF>";
-    const metadata = new Metadata(data);
+    const metadata = createMetadata(data);

    expect(metadata.has("dc:title")).toBeTruthy();
    expect(metadata.has("dc:qux")).toBeFalsy();
--- a/web/app.js
+++ b/web/app.js
@ -1755,11 +1755,8 @@ const PDFViewerApplication = {
        `${this.pdfViewer.enableWebGL ? " [WebGL]" : ""})`
    );

-    let pdfTitle;
-    const infoTitle = info?.Title;
-    if (infoTitle) {
-      pdfTitle = infoTitle;
-    }
+    let pdfTitle = info?.Title;
+
    const metadataTitle = metadata?.get("dc:title");
    if (metadataTitle) {
      // Ghostscript can produce invalid 'dc:title' Metadata entries: