Replace DOMParser with SimpleXMLParser

The `DOMParser` is most likely overkill and may be less secure.
Moreover, it is not supported in Node.js environments.

This patch replaces the `DOMParser` with a simple XML parser. This
should be faster and gives us Node.js support for free. The simple XML
parser is a port of the one that existed in the examples folder with a
small regex fix to make the parsing work correctly.

The unit tests are extended for increased test coverage of the metadata
code. The new method `getAll` is provided so the example does not have
to access internal properties of the object anymore.
This commit is contained in:
Tim van der Meij 2017-09-13 23:37:51 +02:00
parent bc9afdf3c4
commit d4309614f9
No known key found for this signature in database
GPG Key ID: 8C3FD2925A5F2762
5 changed files with 179 additions and 128 deletions

View File

@ -1,105 +0,0 @@
/* Any copyright is dedicated to the Public Domain.
* http://creativecommons.org/publicdomain/zero/1.0/ */
// Dummy XML Parser
/**
 * Minimal DOM-like node used by the dummy XML parser.
 *
 * Element nodes get a `childNodes` array assigned by the parser after
 * construction; text nodes never get one, so every accessor below has to
 * cope with `childNodes` being undefined.
 *
 * @param {string} nodeName - Tag name, or '#text' for text/CDATA nodes.
 * @param {string} [nodeValue] - Text value; only meaningful for text nodes.
 */
function DOMNodeMock(nodeName, nodeValue) {
  this.nodeName = nodeName;
  this.nodeValue = nodeValue;
  // `defineProperty` defaults to non-enumerable, so the back-reference to the
  // parent is skipped when a node is enumerated or serialised.
  Object.defineProperty(this, 'parentNode', {value: null, writable: true});
}
DOMNodeMock.prototype = {
  /** First child node, or undefined for text nodes and empty elements. */
  get firstChild() {
    return this.childNodes ? this.childNodes[0] : undefined;
  },
  /**
   * The node following this one in the parent's `childNodes`, or undefined
   * when there is no parent, no following node, or this node is not actually
   * listed among the parent's children.
   */
  get nextSibling() {
    var parent = this.parentNode;
    if (!parent || !parent.childNodes) {
      return undefined;
    }
    var index = parent.childNodes.indexOf(this);
    // Without this check a detached node (index === -1) would report the
    // parent's first child as its sibling.
    if (index < 0) {
      return undefined;
    }
    return parent.childNodes[index + 1];
  },
  /** Concatenated text of this node and all of its descendants. */
  get textContent() {
    if (!this.childNodes) {
      return this.nodeValue || '';
    }
    return this.childNodes.map(function (child) {
      return child.textContent;
    }).join('');
  },
  /** @returns {boolean} Whether the node has at least one child. */
  hasChildNodes: function () {
    return !!this.childNodes && this.childNodes.length > 0;
  }
};
/**
 * Decodes XML character references and the five predefined entities
 * (`&amp;`, `&lt;`, `&gt;`, `&quot;`, `&apos;`) in `text`.
 *
 * Unknown named entities and numeric references beyond U+10FFFF are left in
 * the output exactly as written.
 *
 * @param {string} text - Raw text taken from between two tags.
 * @returns {string} The decoded text.
 */
function decodeXML(text) {
  if (text.indexOf('&') < 0) {
    return text;
  }
  return text.replace(/&(#(x[0-9a-f]+|\d+)|\w+);/gi, function (all, entityName, number) {
    if (number) {
      // The regex is case-insensitive, so the hex marker may be 'x' or 'X'.
      var isHex = number[0] === 'x' || number[0] === 'X';
      var code = isHex ? parseInt(number.substring(1), 16) : parseInt(number, 10);
      if (code > 0x10FFFF) {
        return all; // Not a Unicode code point; keep the reference verbatim.
      }
      if (code > 0xFFFF) {
        // `String.fromCharCode` works on UTF-16 code units and would truncate
        // astral code points, so emit the surrogate pair explicitly.
        code -= 0x10000;
        return String.fromCharCode(0xD800 + (code >> 10), 0xDC00 + (code & 0x3FF));
      }
      return String.fromCharCode(code);
    }
    switch (entityName) {
      case 'amp':
        return '&';
      case 'lt':
        return '<';
      case 'gt':
        return '>';
      case 'quot':
        return '\"';
      case 'apos':
        return '\'';
    }
    return '&' + entityName + ';';
  });
}
/**
 * Dummy, regex-based stand-in for `DOMParser` that understands just enough
 * XML for simple, well-formed documents. Attributes are matched but
 * discarded, and namespaces are not resolved.
 */
function DOMParserMock() {}
DOMParserMock.prototype = {
  /**
   * Parses `content` into a tree of `DOMNodeMock` objects.
   *
   * The string is reduced in place: every text/CDATA run and every element
   * whose children are already reduced is replaced by "<index>," where
   * <index> is its position in `nodes`, until only the root index is left.
   *
   * @param {string} content - XML source.
   * @returns {{documentElement: DOMNodeMock}} Wrapper holding the last node
   *   that was created, which is the root element for well-formed input
   *   (undefined for empty input).
   */
  parseFromString: function (content) {
    // Drop processing instructions and comments.
    content = content.replace(/<\?[\s\S]*?\?>|<!--[\s\S]*?-->/g, '').trim();
    var nodes = [];
    // Replace text between tags by its node index. The leading `[^<]` keeps
    // the match from starting on a tag: without it, directly adjacent tags
    // such as `<a><b>x</b></a>` would have `<b>x` captured as text.
    content = content.replace(/>([^<][\s\S]*?)</g, function (all, text) {
      var i = nodes.length;
      var node = new DOMNodeMock('#text', decodeXML(text));
      nodes.push(node);
      if (node.textContent.trim().length === 0) {
        return '><'; // ignoring whitespaces
      }
      return '>' + i + ',<';
    });
    // CDATA sections become text nodes as well, without entity decoding.
    content = content.replace(/<!\[CDATA\[([\s\S]*?)\]\]>/g, function (all, text) {
      var i = nodes.length;
      var node = new DOMNodeMock('#text', text);
      nodes.push(node);
      return i + ',';
    });
    // Matches an element that is self-closing or whose content consists only
    // of already-reduced "<index>," entries.
    var elementRegex =
      /<([\w\:]+)((?:[\s\w:=]|'[^']*'|"[^"]*")*)(?:\/>|>([\d,]*)<\/[^>]+>)/g;
    var lastLength;
    do {
      lastLength = nodes.length;
      content = content.replace(elementRegex, function (all, name, attrs, childIndexes) {
        var i = nodes.length;
        var node = new DOMNodeMock(name);
        var children = [];
        if (childIndexes) {
          var indexes = childIndexes.split(',');
          indexes.pop(); // The list ends with ',', so the last entry is empty.
          indexes.forEach(function (child) {
            var childNode = nodes[+child];
            childNode.parentNode = node;
            children.push(childNode);
          });
        }
        node.childNodes = children;
        nodes.push(node);
        return i + ',';
      });
      // Stop once a full pass creates no new element node.
    } while (lastLength < nodes.length);
    return {
      documentElement: nodes.pop()
    };
  }
};
exports.DOMParserMock = DOMParserMock;

View File

@ -9,9 +9,6 @@
var fs = require('fs');
// HACK adding DOMParser to read XMP metadata.
global.DOMParser = require('./domparsermock.js').DOMParserMock;
// Run `gulp dist-install` to generate 'pdfjs-dist' npm package files.
var pdfjsLib = require('pdfjs-dist');
@ -34,7 +31,7 @@ pdfjsLib.getDocument(pdfPath).then(function (doc) {
console.log();
if (data.metadata) {
console.log('## Metadata');
console.log(JSON.stringify(data.metadata.metadata, null, 2));
console.log(JSON.stringify(data.metadata.getAll(), null, 2));
console.log();
}
});

View File

@ -131,6 +131,132 @@ class DOMSVGFactory {
}
}
/**
 * Minimal DOM-like node created by `SimpleXMLParser`.
 *
 * Element nodes get a `childNodes` array assigned by the parser after
 * construction; text nodes never get one, so every accessor below has to
 * cope with `childNodes` being undefined.
 */
class SimpleDOMNode {
  /**
   * @param {string} nodeName - Tag name, or '#text' for text/CDATA nodes.
   * @param {string} [nodeValue] - Text value; only meaningful for text nodes.
   */
  constructor(nodeName, nodeValue) {
    this.nodeName = nodeName;
    this.nodeValue = nodeValue;
    // `defineProperty` defaults to non-enumerable, so the back-reference to
    // the parent is skipped when a node is enumerated or serialised.
    Object.defineProperty(this, 'parentNode', { value: null, writable: true, });
  }

  /** First child node, or undefined for text nodes and empty elements. */
  get firstChild() {
    return this.childNodes ? this.childNodes[0] : undefined;
  }

  /**
   * The node following this one in the parent's `childNodes`, or undefined
   * when there is no parent, no following node, or this node is not actually
   * listed among the parent's children.
   */
  get nextSibling() {
    let siblings = this.parentNode && this.parentNode.childNodes;
    if (!siblings) {
      return undefined;
    }
    let index = siblings.indexOf(this);
    // Without this check a detached node (index === -1) would report the
    // parent's first child as its sibling.
    if (index < 0) {
      return undefined;
    }
    return siblings[index + 1];
  }

  /** Concatenated text of this node and all of its descendants. */
  get textContent() {
    if (!this.childNodes) {
      return this.nodeValue || '';
    }
    return this.childNodes.map(function(child) {
      return child.textContent;
    }).join('');
  }

  /** @returns {boolean} Whether the node has at least one child. */
  hasChildNodes() {
    return !!this.childNodes && this.childNodes.length > 0;
  }
}
/**
 * Small regex-based XML parser that understands just enough XML for simple,
 * well-formed documents. Attributes are matched but discarded, and
 * namespaces are not resolved.
 */
class SimpleXMLParser {
  /**
   * Parses `data` into a tree of `SimpleDOMNode` objects.
   *
   * The string is reduced in place: every text/CDATA run and every element
   * whose children are already reduced is replaced by "<index>," where
   * <index> is its position in `nodes`, until only the root index is left.
   *
   * @param {string} data - XML source.
   * @returns {{documentElement: SimpleDOMNode}} Wrapper holding the last node
   *   that was created, which is the root element for well-formed input
   *   (undefined for empty input).
   */
  parseFromString(data) {
    let nodes = [];

    // Remove all comments and processing instructions.
    data = data.replace(/<\?[\s\S]*?\?>|<!--[\s\S]*?-->/g, '').trim();
    data = data.replace(/<!DOCTYPE[^>\[]+(\[[^\]]+)?[^>]+>/g, '').trim();

    // Extract all text nodes and replace them with a numeric index in
    // the nodes. The leading `[^<]` keeps a match from starting on a tag.
    data = data.replace(/>([^<][\s\S]*?)</g, (all, text) => {
      let length = nodes.length;
      let node = new SimpleDOMNode('#text', this._decodeXML(text));
      nodes.push(node);
      if (node.textContent.trim().length === 0) {
        return '><'; // Ignore whitespace.
      }
      return '>' + length + ',<';
    });

    // Extract all CDATA nodes; their content is not entity-decoded.
    data = data.replace(/<!\[CDATA\[([\s\S]*?)\]\]>/g, (all, text) => {
      let length = nodes.length;
      let node = new SimpleDOMNode('#text', text);
      nodes.push(node);
      return length + ',';
    });

    // Repeatedly replace every element whose content is free of markup (it is
    // self-closing or holds only "<index>," entries) with its own index in
    // the nodes, until a pass creates no new node.
    let regex =
      /<([\w\:]+)((?:[\s\w:=]|'[^']*'|"[^"]*")*)(?:\/>|>([\d,]*)<\/[^>]+>)/g;
    let lastLength;
    do {
      lastLength = nodes.length;
      data = data.replace(regex, (all, name, attrs, childIndexes) => {
        let length = nodes.length;
        let node = new SimpleDOMNode(name);
        let children = [];
        if (childIndexes) {
          let indexes = childIndexes.split(',');
          indexes.pop(); // The list ends with ',', so the last entry is empty.
          indexes.forEach((child) => {
            let childNode = nodes[+child];
            childNode.parentNode = node;
            children.push(childNode);
          });
        }
        node.childNodes = children;
        nodes.push(node);
        return length + ',';
      });
    } while (lastLength < nodes.length);

    // We should only have one root index left, which will be last in the nodes.
    return {
      documentElement: nodes.pop(),
    };
  }

  /**
   * Decodes XML character references and the five predefined entities
   * (`&amp;`, `&lt;`, `&gt;`, `&quot;`, `&apos;`) in `text`.
   *
   * Unknown named entities and numeric references beyond U+10FFFF are left
   * in the output exactly as written.
   *
   * @param {string} text - Raw text taken from between two tags.
   * @returns {string} The decoded text.
   */
  _decodeXML(text) {
    if (text.indexOf('&') < 0) {
      return text;
    }

    return text.replace(/&(#(x[0-9a-f]+|\d+)|\w+);/gi,
                        (all, entityName, number) => {
      if (number) {
        // The regex is case-insensitive, so the hex marker may be 'x' or 'X'.
        let isHex = number[0] === 'x' || number[0] === 'X';
        let codePoint = isHex ? parseInt(number.substring(1), 16) :
                                parseInt(number, 10);
        if (codePoint > 0x10FFFF) {
          return all; // Not a Unicode code point; keep the reference verbatim.
        }
        // `fromCodePoint` (unlike `fromCharCode`) handles code points above
        // U+FFFF by emitting a surrogate pair.
        return String.fromCodePoint(codePoint);
      }
      switch (entityName) {
        case 'amp':
          return '&';
        case 'lt':
          return '<';
        case 'gt':
          return '>';
        case 'quot':
          return '\"';
        case 'apos':
          return '\'';
      }
      return '&' + entityName + ';';
    });
  }
}
/**
* Optimised CSS custom property getter/setter.
* @class
@ -353,4 +479,5 @@ export {
DOMCanvasFactory,
DOMCMapReaderFactory,
DOMSVGFactory,
SimpleXMLParser,
};

View File

@ -13,18 +13,19 @@
* limitations under the License.
*/
import { assert, deprecated } from '../shared/util';
import { SimpleXMLParser } from './dom_utils';
class Metadata {
constructor(data) {
if (typeof data === 'string') {
// Ghostscript may produce invalid metadata, so try to repair that first.
data = this._repair(data);
assert(typeof data === 'string', 'Metadata: input is not a string');
// Convert the string to a DOM `Document`.
let parser = new DOMParser();
data = parser.parseFromString(data, 'application/xml');
} else if (!(data instanceof Document)) {
throw new Error('Metadata: input is not a string or `Document`');
}
// Ghostscript may produce invalid metadata, so try to repair that first.
data = this._repair(data);
// Convert the string to a DOM `Document`.
let parser = new SimpleXMLParser();
data = parser.parseFromString(data);
this._metadata = Object.create(null);
@ -90,9 +91,18 @@ class Metadata {
return this._metadata[name] || null;
}
/**
 * Returns every parsed metadata entry at once.
 *
 * The internal `_metadata` object itself is returned, not a copy, so any
 * change a caller makes to it is visible to later `get`/`has` calls.
 *
 * @returns {Object} The name-to-value map held in `this._metadata`.
 */
getAll() {
return this._metadata;
}
/**
 * Checks whether an entry is stored under `name`.
 *
 * @param {string} name - Entry name to look up, e.g. 'dc:title'.
 * @returns {boolean} True when `this._metadata[name]` is not `undefined`.
 */
has(name) {
return typeof this._metadata[name] !== 'undefined';
}
/**
 * Deprecated accessor kept for callers that still read `.metadata`.
 * Reports the deprecation through `deprecated()` on every access and then
 * delegates to `getAll()`.
 *
 * @returns {Object} Whatever `getAll()` returns.
 */
get metadata() {
deprecated('`metadata` getter; use `getAll()` instead.');
return this.getAll();
}
}
export {

View File

@ -16,15 +16,37 @@
import { Metadata } from '../../src/display/metadata';
describe('metadata', function() {
describe('incorrect_xmp', function() {
it('should fix the incorrect XMP data', function() {
var invalidXMP = '<x:xmpmeta xmlns:x=\'adobe:ns:meta/\'>' +
'<rdf:RDF xmlns:rdf=\'http://www.w3.org/1999/02/22-rdf-syntax-ns#\'>' +
'<rdf:Description xmlns:dc=\'http://purl.org/dc/elements/1.1/\'>' +
'<dc:title>\\376\\377\\000P\\000D\\000F\\000&</dc:title>' +
'</rdf:Description></rdf:RDF></x:xmpmeta>';
var meta = new Metadata(invalidXMP);
expect(meta.get('dc:title')).toEqual('PDF&');
});
it('should handle valid metadata', function() {
var validData = '<x:xmpmeta xmlns:x=\'adobe:ns:meta/\'>' +
'<rdf:RDF xmlns:rdf=\'http://www.w3.org/1999/02/22-rdf-syntax-ns#\'>' +
'<rdf:Description xmlns:dc=\'http://purl.org/dc/elements/1.1/\'>' +
'<dc:title><rdf:Alt><rdf:li xml:lang="x-default">Foo bar baz</rdf:li>' +
'</rdf:Alt></dc:title></rdf:Description></rdf:RDF></x:xmpmeta>';
var metadata = new Metadata(validData);
expect(metadata.has('dc:title')).toBeTruthy();
expect(metadata.has('dc:qux')).toBeFalsy();
expect(metadata.get('dc:title')).toEqual('Foo bar baz');
expect(metadata.get('dc:qux')).toEqual(null);
expect(metadata.getAll()).toEqual({ 'dc:title': 'Foo bar baz', });
});
it('should repair and handle invalid metadata', function() {
var invalidData = '<x:xmpmeta xmlns:x=\'adobe:ns:meta/\'>' +
'<rdf:RDF xmlns:rdf=\'http://www.w3.org/1999/02/22-rdf-syntax-ns#\'>' +
'<rdf:Description xmlns:dc=\'http://purl.org/dc/elements/1.1/\'>' +
'<dc:title>\\376\\377\\000P\\000D\\000F\\000&</dc:title>' +
'</rdf:Description></rdf:RDF></x:xmpmeta>';
var metadata = new Metadata(invalidData);
expect(metadata.has('dc:title')).toBeTruthy();
expect(metadata.has('dc:qux')).toBeFalsy();
expect(metadata.get('dc:title')).toEqual('PDF&');
expect(metadata.get('dc:qux')).toEqual(null);
expect(metadata.getAll()).toEqual({ 'dc:title': 'PDF&', });
});
});