Move the Metadata parsing to the worker-thread

The only reason, as far as I can tell, for parsing the Metadata on the main-thread is how it was originally implemented. When Metadata support was first implemented, it utilized the [`DOMParser`](https://developer.mozilla.org/en-US/docs/Web/API/DOMParser) which isn't available in workers.
Today, with the custom XML-parser being used, that's no longer an issue and it seems reasonable to move the Metadata parsing to the worker-thread[1], since that's where all parsing should happen (for performance reasons).

Based on these changes, we'll be able to reduce the now unnecessary duplication of the XML-parser (and related code) in both of the *built* `pdf.js`/`pdf.worker.js` files.

Finally, this patch changes the `_repair` method to use "Array + join" rather than string concatenation.

---
[1] This needed the previous patch, to enable sending of `Map`s between threads with workers disabled.
This commit is contained in:
Jonas Jenwald 2021-02-16 14:13:39 +01:00
parent 73bf45e64b
commit cc3a6563ee
6 changed files with 177 additions and 142 deletions

146
src/core/metadata_parser.js Normal file
View File

@ -0,0 +1,146 @@
/* Copyright 2012 Mozilla Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import { SimpleXMLParser } from "../shared/xml_parser.js";
/**
 * Parses an XML metadata string into a `Map` of node names to values.
 *
 * The result is exposed through the `serializable` getter as a plain
 * `{ parsedData, rawData }` object.
 */
class MetadataParser {
  /**
   * @param {string} data - The XML metadata, as a string.
   */
  constructor(data) {
    // Ghostscript may produce invalid metadata, so try to repair that first.
    data = this._repair(data);

    // Convert the string to an XML document. The parser is asked to
    // lower-case names (`lowerCaseName: true`), hence all of the node name
    // comparisons in this class use lower-case names.
    const parser = new SimpleXMLParser({ lowerCaseName: true });
    const xmlDocument = parser.parseFromString(data);

    this._metadataMap = new Map();
    this._data = data;

    if (xmlDocument) {
      this._parse(xmlDocument);
    }
  }

  /**
   * Attempts to repair invalid metadata:
   *  - Removes everything before the first `<` character.
   *  - Re-encodes text nodes that start with the literal, octal-escaped,
   *    byte order mark `\376\377`: octal escapes and the five predefined XML
   *    entities are decoded to "bytes", which are then combined pairwise
   *    (high byte first) into character codes. Printable ASCII characters,
   *    except `<`, `>` and `&`, are kept as-is and everything else is written
   *    as a four-digit hexadecimal character reference.
   *
   * @param {string} data - The XML metadata, as a string.
   * @returns {string} The repaired metadata string.
   */
  _repair(data) {
    // Start by removing any "junk" before the first tag (see issue 10395).
    return data
      .replace(/^[^<]+/, "")
      .replace(/>\\376\\377([^<]+)/g, function (all, codes) {
        const bytes = codes
          .replace(/\\([0-3])([0-7])([0-7])/g, function (code, d1, d2, d3) {
            // The digits are strings; the arithmetic coerces them to numbers.
            return String.fromCharCode(d1 * 64 + d2 * 8 + d3 * 1);
          })
          .replace(/&(amp|apos|gt|lt|quot);/g, function (str, name) {
            switch (name) {
              case "amp":
                return "&";
              case "apos":
                return "'";
              case "gt":
                return ">";
              case "lt":
                return "<";
              case "quot":
                return '"';
            }
            throw new Error(`_repair: ${name} isn't defined.`);
          });

        // The match consumed the ">" that closes the preceding tag, so it
        // must be re-inserted in front of the re-encoded text.
        const charBuf = [">"];
        // Only complete byte pairs are decoded. With an odd number of bytes,
        // `charCodeAt(i + 1)` would return NaN for the last one and a broken
        // "&#xaN;" reference would be emitted; hence a dangling trailing byte
        // is skipped instead.
        for (let i = 0, ii = bytes.length; i + 1 < ii; i += 2) {
          const code = bytes.charCodeAt(i) * 256 + bytes.charCodeAt(i + 1);
          if (
            code >= /* Space = */ 32 &&
            code < /* Delete = */ 127 &&
            code !== /* '<' = */ 60 &&
            code !== /* '>' = */ 62 &&
            code !== /* '&' = */ 38
          ) {
            charBuf.push(String.fromCharCode(code));
          } else {
            // Adding 0x10000 and dropping the leading digit zero-pads the
            // hexadecimal value to four digits.
            charBuf.push(
              "&#x" + (0x10000 + code).toString(16).substring(1) + ";"
            );
          }
        }
        return charBuf.join("");
      });
  }

  /**
   * @param {Object} entry - An XML node.
   * @returns {Array<Object>|null} The `rdf:li` children of `entry` when it is
   *   an `rdf:bag`, `rdf:seq` or `rdf:alt` node, and `null` otherwise.
   */
  _getSequence(entry) {
    const name = entry.nodeName;
    if (name !== "rdf:bag" && name !== "rdf:seq" && name !== "rdf:alt") {
      return null;
    }
    return entry.childNodes.filter(node => node.nodeName === "rdf:li");
  }

  /**
   * Stores the trimmed text of every list item found in the first child of
   * `entry` as an Array, keyed by the node name of `entry`. Nothing is stored
   * when `entry` has no children; an empty Array is stored when the first
   * child isn't a sequence node.
   *
   * @param {Object} entry - An XML node.
   */
  _parseArray(entry) {
    if (!entry.hasChildNodes()) {
      return;
    }
    // Child must be a Bag (unordered array) or a Seq.
    const [seqNode] = entry.childNodes;
    const sequence = this._getSequence(seqNode) || [];

    this._metadataMap.set(
      entry.nodeName,
      sequence.map(node => node.textContent.trim())
    );
  }

  /**
   * Walks all `rdf:description` nodes of the `rdf:rdf` element and records
   * their child entries in the metadata map.
   *
   * @param {Object} xmlDocument - The parsed document; only its
   *   `documentElement` property is read.
   */
  _parse(xmlDocument) {
    let rdf = xmlDocument.documentElement;

    if (rdf.nodeName !== "rdf:rdf") {
      // Wrapped in <xmpmeta>
      rdf = rdf.firstChild;
      while (rdf && rdf.nodeName !== "rdf:rdf") {
        rdf = rdf.nextSibling;
      }
    }

    if (!rdf || rdf.nodeName !== "rdf:rdf" || !rdf.hasChildNodes()) {
      return;
    }

    for (const desc of rdf.childNodes) {
      if (desc.nodeName !== "rdf:description") {
        continue;
      }

      for (const entry of desc.childNodes) {
        const name = entry.nodeName;
        switch (name) {
          case "#text":
            continue;
          case "dc:creator":
          case "dc:subject":
            // These entries hold a list of values rather than a single one.
            this._parseArray(entry);
            continue;
        }
        // Entries with the same name overwrite each other; the last one wins.
        this._metadataMap.set(name, entry.textContent.trim());
      }
    }
  }

  /**
   * @returns {{parsedData: Map, rawData: string}} The parsed metadata map
   *   together with the (repaired) metadata string.
   */
  get serializable() {
    return {
      parsedData: this._metadataMap,
      rawData: this._data,
    };
  }
}
export { MetadataParser };

View File

@ -59,6 +59,7 @@ import { Lexer, Parser } from "./parser.js";
import { CipherTransformFactory } from "./crypto.js";
import { ColorSpace } from "./colorspace.js";
import { GlobalImageCache } from "./image_utils.js";
import { MetadataParser } from "./metadata_parser.js";
function fetchDestination(dest) {
return isDict(dest) ? dest.get("D") : dest;
@ -131,20 +132,22 @@ class Catalog {
this.xref.encrypt && this.xref.encrypt.encryptMetadata
);
const stream = this.xref.fetch(streamRef, suppressEncryption);
let metadata;
let metadata = null;
if (stream && isDict(stream.dict)) {
if (isStream(stream) && isDict(stream.dict)) {
const type = stream.dict.get("Type");
const subtype = stream.dict.get("Subtype");
if (isName(type, "Metadata") && isName(subtype, "XML")) {
// XXX: This should examine the charset the XML document defines,
// however since there are currently no real means to decode
// arbitrary charsets, let's just hope that the author of the PDF
// was reasonable enough to stick with the XML default charset,
// which is UTF-8.
// however since there are currently no real means to decode arbitrary
// charsets, let's just hope that the author of the PDF was reasonable
// enough to stick with the XML default charset, which is UTF-8.
try {
metadata = stringToUTF8String(bytesToString(stream.getBytes()));
const data = stringToUTF8String(bytesToString(stream.getBytes()));
if (data) {
metadata = new MetadataParser(data).serializable;
}
} catch (e) {
if (e instanceof MissingDataException) {
throw e;

View File

@ -13,129 +13,12 @@
* limitations under the License.
*/
import { assert, objectFromEntries } from "../shared/util.js";
import { SimpleXMLParser } from "../shared/xml_parser.js";
import { objectFromEntries } from "../shared/util.js";
class Metadata {
constructor(data) {
assert(typeof data === "string", "Metadata: input is not a string");
// Ghostscript may produce invalid metadata, so try to repair that first.
data = this._repair(data);
// Convert the string to an XML document.
const parser = new SimpleXMLParser({ lowerCaseName: true });
const xmlDocument = parser.parseFromString(data);
this._metadataMap = new Map();
if (xmlDocument) {
this._parse(xmlDocument);
}
this._data = data;
}
_repair(data) {
// Start by removing any "junk" before the first tag (see issue 10395).
return data
.replace(/^[^<]+/, "")
.replace(/>\\376\\377([^<]+)/g, function (all, codes) {
const bytes = codes
.replace(/\\([0-3])([0-7])([0-7])/g, function (code, d1, d2, d3) {
return String.fromCharCode(d1 * 64 + d2 * 8 + d3 * 1);
})
.replace(/&(amp|apos|gt|lt|quot);/g, function (str, name) {
switch (name) {
case "amp":
return "&";
case "apos":
return "'";
case "gt":
return ">";
case "lt":
return "<";
case "quot":
return '"';
}
throw new Error(`_repair: ${name} isn't defined.`);
});
let chars = "";
for (let i = 0, ii = bytes.length; i < ii; i += 2) {
const code = bytes.charCodeAt(i) * 256 + bytes.charCodeAt(i + 1);
if (
code >= /* Space = */ 32 &&
code < /* Delete = */ 127 &&
code !== /* '<' = */ 60 &&
code !== /* '>' = */ 62 &&
code !== /* '&' = */ 38
) {
chars += String.fromCharCode(code);
} else {
chars += "&#x" + (0x10000 + code).toString(16).substring(1) + ";";
}
}
return ">" + chars;
});
}
_getSequence(entry) {
const name = entry.nodeName;
if (name !== "rdf:bag" && name !== "rdf:seq" && name !== "rdf:alt") {
return null;
}
return entry.childNodes.filter(node => node.nodeName === "rdf:li");
}
_parseArray(entry) {
if (!entry.hasChildNodes()) {
return;
}
// Child must be a Bag (unordered array) or a Seq.
const [seqNode] = entry.childNodes;
const sequence = this._getSequence(seqNode) || [];
this._metadataMap.set(
entry.nodeName,
sequence.map(node => node.textContent.trim())
);
}
_parse(xmlDocument) {
let rdf = xmlDocument.documentElement;
if (rdf.nodeName !== "rdf:rdf") {
// Wrapped in <xmpmeta>
rdf = rdf.firstChild;
while (rdf && rdf.nodeName !== "rdf:rdf") {
rdf = rdf.nextSibling;
}
}
if (!rdf || rdf.nodeName !== "rdf:rdf" || !rdf.hasChildNodes()) {
return;
}
for (const desc of rdf.childNodes) {
if (desc.nodeName !== "rdf:description") {
continue;
}
for (const entry of desc.childNodes) {
const name = entry.nodeName;
switch (name) {
case "#text":
continue;
case "dc:creator":
case "dc:subject":
this._parseArray(entry);
continue;
}
this._metadataMap.set(name, entry.textContent.trim());
}
}
constructor({ parsedData, rawData }) {
this._metadataMap = parsedData;
this._data = rawData;
}
getRaw() {

View File

@ -16,7 +16,7 @@
// The code for XMLParserBase copied from
// https://github.com/mozilla/shumway/blob/16451d8836fa85f4b16eeda8b4bda2fa9e2b22b0/src/avm2/natives/xml.ts
import { encodeToXmlString } from "./util.js";
import { encodeToXmlString } from "../shared/util.js";
const XMLParserErrorCode = {
NoError: 0,

View File

@ -15,6 +15,12 @@
import { isEmptyObj } from "./test_utils.js";
import { Metadata } from "../../src/display/metadata.js";
import { MetadataParser } from "../../src/core/metadata_parser.js";
/**
 * Test helper that runs `data` through the `MetadataParser` and wraps its
 * serializable result in a `Metadata` instance.
 *
 * @param {string} data - The XML metadata, as a string.
 * @returns {Metadata}
 */
function createMetadata(data) {
  const { serializable } = new MetadataParser(data);
  return new Metadata(serializable);
}
describe("metadata", function () {
it("should handle valid metadata", function () {
@ -24,7 +30,7 @@ describe("metadata", function () {
"<rdf:Description xmlns:dc='http://purl.org/dc/elements/1.1/'>" +
'<dc:title><rdf:Alt><rdf:li xml:lang="x-default">Foo bar baz</rdf:li>' +
"</rdf:Alt></dc:title></rdf:Description></rdf:RDF></x:xmpmeta>";
const metadata = new Metadata(data);
const metadata = createMetadata(data);
expect(metadata.has("dc:title")).toBeTruthy();
expect(metadata.has("dc:qux")).toBeFalsy();
@ -42,7 +48,7 @@ describe("metadata", function () {
"<rdf:Description xmlns:dc='http://purl.org/dc/elements/1.1/'>" +
"<dc:title>\\376\\377\\000P\\000D\\000F\\000&</dc:title>" +
"</rdf:Description></rdf:RDF></x:xmpmeta>";
const metadata = new Metadata(data);
const metadata = createMetadata(data);
expect(metadata.has("dc:title")).toBeTruthy();
expect(metadata.has("dc:qux")).toBeFalsy();
@ -85,7 +91,7 @@ describe("metadata", function () {
"<dc:creator><rdf:Seq><rdf:li>\\376\\377\\000O\\000D\\000I\\000S" +
"</rdf:li></rdf:Seq></dc:creator></rdf:Description></rdf:RDF>" +
"</x:xmpmeta>";
const metadata = new Metadata(data);
const metadata = createMetadata(data);
expect(metadata.has("dc:title")).toBeTruthy();
expect(metadata.has("dc:qux")).toBeFalsy();
@ -128,7 +134,7 @@ describe("metadata", function () {
"</rdf:RDF>" +
"</x:xmpmeta>" +
'<?xpacket end="w"?>';
const metadata = new Metadata(data);
const metadata = createMetadata(data);
expect(isEmptyObj(metadata.getAll())).toEqual(true);
});
@ -159,7 +165,7 @@ describe("metadata", function () {
'<dc:title><rdf:Alt><rdf:li xml:lang="x-default"></rdf:li>' +
"</rdf:Alt></dc:title><dc:format>application/pdf</dc:format>" +
'</rdf:Description></rdf:RDF></x:xmpmeta><?xpacket end="w"?>';
const metadata = new Metadata(data);
const metadata = createMetadata(data);
expect(metadata.has("dc:title")).toBeTruthy();
expect(metadata.has("dc:qux")).toBeFalsy();
@ -191,7 +197,7 @@ describe("metadata", function () {
"<dc:title><rdf:Alt>" +
'<rdf:li xml:lang="x-default">&apos;Foo bar baz&apos;</rdf:li>' +
"</rdf:Alt></dc:title></rdf:Description></rdf:RDF></x:xmpmeta>";
const metadata = new Metadata(data);
const metadata = createMetadata(data);
expect(metadata.has("dc:title")).toBeTruthy();
expect(metadata.has("dc:qux")).toBeFalsy();
@ -220,7 +226,7 @@ describe("metadata", function () {
"<xmpMM:DocumentID>uuid:00000000-1c84-3cf9-89ba-bef0e729c831" +
"</xmpMM:DocumentID></rdf:Description>" +
'</rdf:RDF></x:xmpmeta><?xpacket end="w"?>';
const metadata = new Metadata(data);
const metadata = createMetadata(data);
expect(isEmptyObj(metadata.getAll())).toEqual(true);
});
@ -249,7 +255,7 @@ describe("metadata", function () {
" </dc:title>" +
" </rdf:Description>" +
"</rdf:RDF>";
const metadata = new Metadata(data);
const metadata = createMetadata(data);
expect(metadata.has("dc:title")).toBeTruthy();
expect(metadata.has("dc:qux")).toBeFalsy();

View File

@ -1755,11 +1755,8 @@ const PDFViewerApplication = {
`${this.pdfViewer.enableWebGL ? " [WebGL]" : ""})`
);
let pdfTitle;
const infoTitle = info?.Title;
if (infoTitle) {
pdfTitle = infoTitle;
}
let pdfTitle = info?.Title;
const metadataTitle = metadata?.get("dc:title");
if (metadataTitle) {
// Ghostscript can produce invalid 'dc:title' Metadata entries: