diff --git a/examples/node/domparsermock.js b/examples/node/domparsermock.js deleted file mode 100644 index 1dde248c3..000000000 --- a/examples/node/domparsermock.js +++ /dev/null @@ -1,105 +0,0 @@ -/* Any copyright is dedicated to the Public Domain. - * http://creativecommons.org/publicdomain/zero/1.0/ */ - -// Dummy XML Parser - -function DOMNodeMock(nodeName, nodeValue) { - this.nodeName = nodeName; - this.nodeValue = nodeValue; - Object.defineProperty(this, 'parentNode', {value: null, writable: true}); -} -DOMNodeMock.prototype = { - get firstChild() { - return this.childNodes[0]; - }, - get nextSibling() { - var index = this.parentNode.childNodes.indexOf(this); - return this.parentNode.childNodes[index + 1]; - }, - get textContent() { - if (!this.childNodes) { - return this.nodeValue || ''; - } - return this.childNodes.map(function (child) { - return child.textContent; - }).join(''); - }, - hasChildNodes: function () { - return this.childNodes && this.childNodes.length > 0; - } -}; - -function decodeXML(text) { - if (text.indexOf('&') < 0) { - return text; - } - return text.replace(/&(#(x[0-9a-f]+|\d+)|\w+);/gi, function (all, entityName, number) { - if (number) { - return String.fromCharCode(number[0] === 'x' ? parseInt(number.substring(1), 16) : +number); - } - switch (entityName) { - case 'amp': - return '&'; - case 'lt': - return '<'; - case 'gt': - return '>'; - case 'quot': - return '\"'; - case 'apos': - return '\''; - } - return '&' + entityName + ';'; - }); -} - -function DOMParserMock() {}; -DOMParserMock.prototype = { - parseFromString: function (content) { - content = content.replace(/<\?[\s\S]*?\?>|/g, '').trim(); - var nodes = []; - content = content.replace(/>([\s\S]+?)<'; // ignoring whitespaces - } - return '>' + i + ',<'; - }); - content = content.replace(//g, function (all, text) { - var i = nodes.length; - var node = new DOMNodeMock('#text', text); - nodes.push(node); - return i + ','; - }); - var lastLength; - do { - lastLength = nodes.length; - content = content.replace(/<([\w\:]+)((?:[\s\w:=]|'[^']*'|"[^"]*")*)(?:\/>|>([\d,]*)<\/[^>]+>)/g, - function (all, name, attrs, content) { - var i = nodes.length; - var node = new DOMNodeMock(name); - var children = []; - if (content) { - content = content.split(','); - content.pop(); - content.forEach(function (child) { - var childNode = nodes[+child]; - childNode.parentNode = node; - children.push(childNode); - }) - } - node.childNodes = children; - nodes.push(node); - return i + ','; - - }); - } while(lastLength < nodes.length); - return { - documentElement: nodes.pop() - }; - } -}; - -exports.DOMParserMock = DOMParserMock; diff --git a/examples/node/getinfo.js b/examples/node/getinfo.js index 3dce2e20a..61034cfa3 100644 --- a/examples/node/getinfo.js +++ b/examples/node/getinfo.js @@ -9,9 +9,6 @@ var fs = require('fs'); -// HACK adding DOMParser to read XMP metadata. -global.DOMParser = require('./domparsermock.js').DOMParserMock; - // Run `gulp dist-install` to generate 'pdfjs-dist' npm package files. var pdfjsLib = require('pdfjs-dist'); @@ -34,7 +31,7 @@ pdfjsLib.getDocument(pdfPath).then(function (doc) { console.log(); if (data.metadata) { console.log('## Metadata'); - console.log(JSON.stringify(data.metadata.metadata, null, 2)); + console.log(JSON.stringify(data.metadata.getAll(), null, 2)); console.log(); } }); diff --git a/src/display/dom_utils.js b/src/display/dom_utils.js index 4d0116849..21cfb0425 100644 --- a/src/display/dom_utils.js +++ b/src/display/dom_utils.js @@ -131,6 +131,132 @@ class DOMSVGFactory { } } +class SimpleDOMNode { + constructor(nodeName, nodeValue) { + this.nodeName = nodeName; + this.nodeValue = nodeValue; + + Object.defineProperty(this, 'parentNode', { value: null, writable: true, }); + } + + get firstChild() { + return this.childNodes[0]; + } + + get nextSibling() { + let index = this.parentNode.childNodes.indexOf(this); + return this.parentNode.childNodes[index + 1]; + } + + get textContent() { + if (!this.childNodes) { + return this.nodeValue || ''; + } + return this.childNodes.map(function(child) { + return child.textContent; + }).join(''); + } + + hasChildNodes() { + return this.childNodes && this.childNodes.length > 0; + } +} + +class SimpleXMLParser { + parseFromString(data) { + let nodes = []; + + // Remove all comments and processing instructions. + data = data.replace(/<\?[\s\S]*?\?>|/g, '').trim(); + data = data.replace(/\[]+(\[[^\]]+)?[^>]+>/g, '').trim(); + + // Extract all text nodes and replace them with a numeric index in + // the nodes. + data = data.replace(/>([^<][\s\S]*?) { + let length = nodes.length; + let node = new SimpleDOMNode('#text', this._decodeXML(text)); + nodes.push(node); + if (node.textContent.trim().length === 0) { + return '><'; // Ignore whitespace. + } + return '>' + length + ',<'; + }); + + // Extract all CDATA nodes. + data = data.replace(//g, + function(all, text) { + let length = nodes.length; + let node = new SimpleDOMNode('#text', text); + nodes.push(node); + return length + ','; + }); + + // Until nodes without '<' and '>' content are present, replace them + // with a numeric index in the nodes. + let regex = + /<([\w\:]+)((?:[\s\w:=]|'[^']*'|"[^"]*")*)(?:\/>|>([\d,]*)<\/[^>]+>)/g; + let lastLength; + do { + lastLength = nodes.length; + data = data.replace(regex, function(all, name, attrs, data) { + let length = nodes.length; + let node = new SimpleDOMNode(name); + let children = []; + if (data) { + data = data.split(','); + data.pop(); + data.forEach(function(child) { + let childNode = nodes[+child]; + childNode.parentNode = node; + children.push(childNode); + }); + } + + node.childNodes = children; + nodes.push(node); + return length + ','; + }); + } while (lastLength < nodes.length); + + // We should only have one root index left, which will be last in the nodes. + return { + documentElement: nodes.pop(), + }; + } + + _decodeXML(text) { + if (text.indexOf('&') < 0) { + return text; + } + + return text.replace(/&(#(x[0-9a-f]+|\d+)|\w+);/gi, + function(all, entityName, number) { + if (number) { + if (number[0] === 'x') { + number = parseInt(number.substring(1), 16); + } else { + number = +number; + } + return String.fromCharCode(number); + } + + switch (entityName) { + case 'amp': + return '&'; + case 'lt': + return '<'; + case 'gt': + return '>'; + case 'quot': + return '\"'; + case 'apos': + return '\''; + } + return '&' + entityName + ';'; + }); + } +} + /** * Optimised CSS custom property getter/setter. * @class @@ -353,4 +479,5 @@ export { DOMCanvasFactory, DOMCMapReaderFactory, DOMSVGFactory, + SimpleXMLParser, }; diff --git a/src/display/metadata.js b/src/display/metadata.js index a77a138e7..7878ec838 100644 --- a/src/display/metadata.js +++ b/src/display/metadata.js @@ -13,43 +13,49 @@ * limitations under the License. */ -function fixMetadata(meta) { - return meta.replace(/>\\376\\377([^<]+)/g, function(all, codes) { - var bytes = codes.replace(/\\([0-3])([0-7])([0-7])/g, - function(code, d1, d2, d3) { - return String.fromCharCode(d1 * 64 + d2 * 8 + d3 * 1); - }); - var chars = ''; - for (var i = 0; i < bytes.length; i += 2) { - var code = bytes.charCodeAt(i) * 256 + bytes.charCodeAt(i + 1); - chars += (code >= 32 && code < 127 && code !== 60 && code !== 62 && - code !== 38) ? String.fromCharCode(code) : - '&#x' + (0x10000 + code).toString(16).substring(1) + ';'; - } - return '>' + chars; - }); -} +import { assert, deprecated } from '../shared/util'; +import { SimpleXMLParser } from './dom_utils'; -function Metadata(meta) { - if (typeof meta === 'string') { - // Ghostscript produces invalid metadata - meta = fixMetadata(meta); +class Metadata { + constructor(data) { + assert(typeof data === 'string', 'Metadata: input is not a string'); - var parser = new DOMParser(); - meta = parser.parseFromString(meta, 'application/xml'); - } else if (!(meta instanceof Document)) { - throw new Error('Metadata: Invalid metadata object'); + // Ghostscript may produce invalid metadata, so try to repair that first. + data = this._repair(data); + + // Convert the string to a DOM `Document`. + let parser = new SimpleXMLParser(); + data = parser.parseFromString(data); + + this._metadata = Object.create(null); + + this._parse(data); } - this.metaDocument = meta; - this.metadata = Object.create(null); - this.parse(); -} + _repair(data) { + return data.replace(/>\\376\\377([^<]+)/g, function(all, codes) { + let bytes = codes.replace(/\\([0-3])([0-7])([0-7])/g, + function(code, d1, d2, d3) { + return String.fromCharCode(d1 * 64 + d2 * 8 + d3 * 1); + }); -Metadata.prototype = { - parse: function Metadata_parse() { - var doc = this.metaDocument; - var rdf = doc.documentElement; + let chars = ''; + for (let i = 0, ii = bytes.length; i < ii; i += 2) { + let code = bytes.charCodeAt(i) * 256 + bytes.charCodeAt(i + 1); + if (code >= 32 && code < 127 && code !== 60 && code !== 62 && + code !== 38) { + chars += String.fromCharCode(code); + } else { + chars += '&#x' + (0x10000 + code).toString(16).substring(1) + ';'; + } + } + + return '>' + chars; + }); + } + + _parse(domDocument) { + let rdf = domDocument.documentElement; if (rdf.nodeName.toLowerCase() !== 'rdf:rdf') { // Wrapped in rdf = rdf.firstChild; @@ -58,36 +64,46 @@ Metadata.prototype = { } } - var nodeName = (rdf) ? rdf.nodeName.toLowerCase() : null; + let nodeName = rdf ? rdf.nodeName.toLowerCase() : null; if (!rdf || nodeName !== 'rdf:rdf' || !rdf.hasChildNodes()) { return; } - var children = rdf.childNodes, desc, entry, name, i, ii, length, iLength; - for (i = 0, length = children.length; i < length; i++) { - desc = children[i]; + let children = rdf.childNodes; + for (let i = 0, ii = children.length; i < ii; i++) { + let desc = children[i]; if (desc.nodeName.toLowerCase() !== 'rdf:description') { continue; } - for (ii = 0, iLength = desc.childNodes.length; ii < iLength; ii++) { - if (desc.childNodes[ii].nodeName.toLowerCase() !== '#text') { - entry = desc.childNodes[ii]; - name = entry.nodeName.toLowerCase(); - this.metadata[name] = entry.textContent.trim(); + for (let j = 0, jj = desc.childNodes.length; j < jj; j++) { + if (desc.childNodes[j].nodeName.toLowerCase() !== '#text') { + let entry = desc.childNodes[j]; + let name = entry.nodeName.toLowerCase(); + + this._metadata[name] = entry.textContent.trim(); } } } - }, + } - get: function Metadata_get(name) { - return this.metadata[name] || null; - }, + get(name) { + return this._metadata[name] || null; + } - has: function Metadata_has(name) { - return typeof this.metadata[name] !== 'undefined'; - }, -}; + getAll() { + return this._metadata; + } + + has(name) { + return typeof this._metadata[name] !== 'undefined'; + } + + get metadata() { + deprecated('`metadata` getter; use `getAll()` instead.'); + return this.getAll(); + } +} export { Metadata, diff --git a/test/unit/api_spec.js b/test/unit/api_spec.js index d5e5e48da..660b56d4e 100644 --- a/test/unit/api_spec.js +++ b/test/unit/api_spec.js @@ -790,9 +790,6 @@ describe('api', function() { }); }); it('gets metadata', function(done) { - if (isNodeJS()) { - pending('Document is not supported in Node.js.'); - } var promise = doc.getMetadata(); promise.then(function(metadata) { expect(metadata.info['Title']).toEqual('Basic API Test'); diff --git a/test/unit/clitests.json b/test/unit/clitests.json index 46b115ada..5c27deeaa 100644 --- a/test/unit/clitests.json +++ b/test/unit/clitests.json @@ -14,6 +14,7 @@ "evaluator_spec.js", "fonts_spec.js", "function_spec.js", + "metadata_spec.js", "murmurhash3_spec.js", "node_stream_spec.js", "parser_spec.js", diff --git a/test/unit/metadata_spec.js b/test/unit/metadata_spec.js index 548bf4318..f7fa947fa 100644 --- a/test/unit/metadata_spec.js +++ b/test/unit/metadata_spec.js @@ -16,15 +16,37 @@ import { Metadata } from '../../src/display/metadata'; describe('metadata', function() { - describe('incorrect_xmp', function() { - it('should fix the incorrect XMP data', function() { - var invalidXMP = '' + - '' + - '' + - '\\376\\377\\000P\\000D\\000F\\000&' + - ''; - var meta = new Metadata(invalidXMP); - expect(meta.get('dc:title')).toEqual('PDF&'); - }); + it('should handle valid metadata', function() { + var validData = '' + + '' + + '' + + 'Foo bar baz' + + ''; + var metadata = new Metadata(validData); + + expect(metadata.has('dc:title')).toBeTruthy(); + expect(metadata.has('dc:qux')).toBeFalsy(); + + expect(metadata.get('dc:title')).toEqual('Foo bar baz'); + expect(metadata.get('dc:qux')).toEqual(null); + + expect(metadata.getAll()).toEqual({ 'dc:title': 'Foo bar baz', }); + }); + + it('should repair and handle invalid metadata', function() { + var invalidData = '' + + '' + + '' + + '\\376\\377\\000P\\000D\\000F\\000&' + + ''; + var metadata = new Metadata(invalidData); + + expect(metadata.has('dc:title')).toBeTruthy(); + expect(metadata.has('dc:qux')).toBeFalsy(); + + expect(metadata.get('dc:title')).toEqual('PDF&'); + expect(metadata.get('dc:qux')).toEqual(null); + + expect(metadata.getAll()).toEqual({ 'dc:title': 'PDF&', }); }); });