Merge pull request #8912 from timvandermeij/xml-parser

[api-minor] Replace `DOMParser` with `SimpleXMLParser`
2017-09-20 23:45:00 +02:00 · 2017-09-20 23:45:00 +02:00 · d7b37ae745
commit d7b37ae745
parent abc864fca9 2281061882
7 changed files with 226 additions and 171 deletions
--- a/examples/node/domparsermock.js
+++ b/examples/node/domparsermock.js
@ -1,105 +0,0 @@
-/* Any copyright is dedicated to the Public Domain.
- * http://creativecommons.org/publicdomain/zero/1.0/ */
-
-// Dummy XML Parser
-
-function DOMNodeMock(nodeName, nodeValue) {
-  this.nodeName = nodeName;
-  this.nodeValue = nodeValue;
-  Object.defineProperty(this, 'parentNode', {value: null, writable: true});
-}
-DOMNodeMock.prototype = {
-  get firstChild() {
-    return this.childNodes[0];
-  },
-  get nextSibling() {
-    var index = this.parentNode.childNodes.indexOf(this);
-    return this.parentNode.childNodes[index + 1];
-  },
-  get textContent() {
-    if (!this.childNodes) {
-      return this.nodeValue || '';
-    }
-    return this.childNodes.map(function (child) {
-      return child.textContent;
-    }).join('');
-  },
-  hasChildNodes: function () {
-    return this.childNodes && this.childNodes.length > 0;
-  }
-};
-
-function decodeXML(text) {
-  if (text.indexOf('&') < 0) {
-    return text;
-  }
-  return text.replace(/&(#(x[0-9a-f]+|\d+)|\w+);/gi, function (all, entityName, number) {
-    if (number) {
-      return String.fromCharCode(number[0] === 'x' ? parseInt(number.substring(1), 16) : +number);
-    }
-    switch (entityName) {
-      case 'amp':
-        return '&';
-      case 'lt':
-        return '<';
-      case 'gt':
-        return '>';
-      case 'quot':
-        return '\"';
-      case 'apos':
-        return '\'';
-    }
-    return '&' + entityName + ';';
-  });
-}
-
-function DOMParserMock() {};
-DOMParserMock.prototype = {
-  parseFromString: function (content) {
-    content = content.replace(/<\?[\s\S]*?\?>|<!--[\s\S]*?-->/g, '').trim();
-    var nodes = [];
-    content = content.replace(/>([\s\S]+?)</g, function (all, text) {
-      var i = nodes.length;
-      var node = new DOMNodeMock('#text', decodeXML(text));
-      nodes.push(node);
-      if (node.textContent.trim().length === 0) {
-        return '><'; // ignoring whitespaces
-      }
-      return '>' + i + ',<';
-    });
-    content = content.replace(/<!\[CDATA\[([\s\S]*?)\]\]>/g, function (all, text) {
-      var i = nodes.length;
-      var node = new DOMNodeMock('#text', text);
-      nodes.push(node);
-      return i + ',';
-    });
-    var lastLength;
-    do {
-      lastLength = nodes.length;
-      content = content.replace(/<([\w\:]+)((?:[\s\w:=]|'[^']*'|"[^"]*")*)(?:\/>|>([\d,]*)<\/[^>]+>)/g,
-        function (all, name, attrs, content) {
-        var i = nodes.length;
-        var node = new DOMNodeMock(name);
-        var children = [];
-        if (content) {
-          content = content.split(',');
-          content.pop();
-          content.forEach(function (child) {
-            var childNode = nodes[+child];
-            childNode.parentNode = node;
-            children.push(childNode);
-          })
-        }
-        node.childNodes = children;
-        nodes.push(node);
-        return i + ',';
-
-      });
-    } while(lastLength < nodes.length);
-    return {
-      documentElement: nodes.pop()
-    };
-  }
-};
-
-exports.DOMParserMock = DOMParserMock;
--- a/examples/node/getinfo.js
+++ b/examples/node/getinfo.js
@ -9,9 +9,6 @@

 var fs = require('fs');

-// HACK adding DOMParser to read XMP metadata.
-global.DOMParser = require('./domparsermock.js').DOMParserMock;
-
 // Run `gulp dist-install` to generate 'pdfjs-dist' npm package files.
 var pdfjsLib = require('pdfjs-dist');

@ -34,7 +31,7 @@ pdfjsLib.getDocument(pdfPath).then(function (doc) {
    console.log();
    if (data.metadata) {
      console.log('## Metadata');
-      console.log(JSON.stringify(data.metadata.metadata, null, 2));
+      console.log(JSON.stringify(data.metadata.getAll(), null, 2));
      console.log();
    }
  });
--- a/src/display/dom_utils.js
+++ b/src/display/dom_utils.js
@ -131,6 +131,132 @@ class DOMSVGFactory {
  }
 }

+class SimpleDOMNode {
+  constructor(nodeName, nodeValue) {
+    this.nodeName = nodeName;
+    this.nodeValue = nodeValue;
+
+    Object.defineProperty(this, 'parentNode', { value: null, writable: true, });
+  }
+
+  get firstChild() {
+    return this.childNodes[0];
+  }
+
+  get nextSibling() {
+    let index = this.parentNode.childNodes.indexOf(this);
+    return this.parentNode.childNodes[index + 1];
+  }
+
+  get textContent() {
+    if (!this.childNodes) {
+      return this.nodeValue || '';
+    }
+    return this.childNodes.map(function(child) {
+      return child.textContent;
+    }).join('');
+  }
+
+  hasChildNodes() {
+    return this.childNodes && this.childNodes.length > 0;
+  }
+}
+
+class SimpleXMLParser {
+  parseFromString(data) {
+    let nodes = [];
+
+    // Remove all comments, processing instructions and DOCTYPE declarations.
+    data = data.replace(/<\?[\s\S]*?\?>|<!--[\s\S]*?-->/g, '').trim();
+    data = data.replace(/<!DOCTYPE[^>\[]+(\[[^\]]+)?[^>]+>/g, '').trim();
+
+    // Extract all text nodes and replace them with a numeric index in
+    // the nodes.
+    data = data.replace(/>([^<][\s\S]*?)</g, (all, text) => {
+      let length = nodes.length;
+      let node = new SimpleDOMNode('#text', this._decodeXML(text));
+      nodes.push(node);
+      if (node.textContent.trim().length === 0) {
+        return '><'; // Ignore whitespace.
+      }
+      return '>' + length + ',<';
+    });
+
+    // Extract all CDATA nodes.
+    data = data.replace(/<!\[CDATA\[([\s\S]*?)\]\]>/g,
+        function(all, text) {
+      let length = nodes.length;
+      let node = new SimpleDOMNode('#text', text);
+      nodes.push(node);
+      return length + ',';
+    });
+
+    // While elements whose content contains no '<' or '>' are present,
+    // replace them with a numeric index into `nodes`.
+    let regex =
+      /<([\w\:]+)((?:[\s\w:=]|'[^']*'|"[^"]*")*)(?:\/>|>([\d,]*)<\/[^>]+>)/g;
+    let lastLength;
+    do {
+      lastLength = nodes.length;
+      data = data.replace(regex, function(all, name, attrs, data) {
+        let length = nodes.length;
+        let node = new SimpleDOMNode(name);
+        let children = [];
+        if (data) {
+          data = data.split(',');
+          data.pop();
+          data.forEach(function(child) {
+            let childNode = nodes[+child];
+            childNode.parentNode = node;
+            children.push(childNode);
+          });
+        }
+
+        node.childNodes = children;
+        nodes.push(node);
+        return length + ',';
+      });
+    } while (lastLength < nodes.length);
+
+    // Only the root element should be left, as the last entry in `nodes`.
+    return {
+      documentElement: nodes.pop(),
+    };
+  }
+
+  _decodeXML(text) {
+    if (text.indexOf('&') < 0) {
+      return text;
+    }
+
+    return text.replace(/&(#(x[0-9a-f]+|\d+)|\w+);/gi,
+        function(all, entityName, number) {
+      if (number) {
+        if (number[0] === 'x') {
+          number = parseInt(number.substring(1), 16);
+        } else {
+          number = +number;
+        }
+        return String.fromCharCode(number);
+      }
+
+      switch (entityName) {
+        case 'amp':
+          return '&';
+        case 'lt':
+          return '<';
+        case 'gt':
+          return '>';
+        case 'quot':
+          return '\"';
+        case 'apos':
+          return '\'';
+      }
+      return '&' + entityName + ';';
+    });
+  }
+}
+
 /**
 * Optimised CSS custom property getter/setter.
 * @class
@ -353,4 +479,5 @@ export {
  DOMCanvasFactory,
  DOMCMapReaderFactory,
  DOMSVGFactory,
+  SimpleXMLParser,
 };
--- a/src/display/metadata.js
+++ b/src/display/metadata.js
@ -13,43 +13,49 @@
 * limitations under the License.
 */

-function fixMetadata(meta) {
-  return meta.replace(/>\\376\\377([^<]+)/g, function(all, codes) {
-    var bytes = codes.replace(/\\([0-3])([0-7])([0-7])/g,
-                              function(code, d1, d2, d3) {
-      return String.fromCharCode(d1 * 64 + d2 * 8 + d3 * 1);
-    });
-    var chars = '';
-    for (var i = 0; i < bytes.length; i += 2) {
-      var code = bytes.charCodeAt(i) * 256 + bytes.charCodeAt(i + 1);
-      chars += (code >= 32 && code < 127 && code !== 60 && code !== 62 &&
-        code !== 38) ? String.fromCharCode(code) :
-        '&#x' + (0x10000 + code).toString(16).substring(1) + ';';
-    }
-    return '>' + chars;
-  });
-}
+import { assert, deprecated } from '../shared/util';
+import { SimpleXMLParser } from './dom_utils';

-function Metadata(meta) {
-  if (typeof meta === 'string') {
-    // Ghostscript produces invalid metadata
-    meta = fixMetadata(meta);
+class Metadata {
+  constructor(data) {
+    assert(typeof data === 'string', 'Metadata: input is not a string');

-    var parser = new DOMParser();
-    meta = parser.parseFromString(meta, 'application/xml');
-  } else if (!(meta instanceof Document)) {
-    throw new Error('Metadata: Invalid metadata object');
+    // Ghostscript may produce invalid metadata, so try to repair that first.
+    data = this._repair(data);
+
+    // Parse the string into a minimal DOM-like document object.
+    let parser = new SimpleXMLParser();
+    data = parser.parseFromString(data);
+
+    this._metadata = Object.create(null);
+
+    this._parse(data);
  }

-  this.metaDocument = meta;
-  this.metadata = Object.create(null);
-  this.parse();
-}
+  _repair(data) {
+    return data.replace(/>\\376\\377([^<]+)/g, function(all, codes) {
+      let bytes = codes.replace(/\\([0-3])([0-7])([0-7])/g,
+          function(code, d1, d2, d3) {
+        return String.fromCharCode(d1 * 64 + d2 * 8 + d3 * 1);
+      });

-Metadata.prototype = {
-  parse: function Metadata_parse() {
-    var doc = this.metaDocument;
-    var rdf = doc.documentElement;
+      let chars = '';
+      for (let i = 0, ii = bytes.length; i < ii; i += 2) {
+        let code = bytes.charCodeAt(i) * 256 + bytes.charCodeAt(i + 1);
+        if (code >= 32 && code < 127 && code !== 60 && code !== 62 &&
+            code !== 38) {
+          chars += String.fromCharCode(code);
+        } else {
+          chars += '&#x' + (0x10000 + code).toString(16).substring(1) + ';';
+        }
+      }
+
+      return '>' + chars;
+    });
+  }
+
+  _parse(domDocument) {
+    let rdf = domDocument.documentElement;

    if (rdf.nodeName.toLowerCase() !== 'rdf:rdf') { // Wrapped in <xmpmeta>
      rdf = rdf.firstChild;
@ -58,36 +64,46 @@ Metadata.prototype = {
      }
    }

-    var nodeName = (rdf) ? rdf.nodeName.toLowerCase() : null;
+    let nodeName = rdf ? rdf.nodeName.toLowerCase() : null;
    if (!rdf || nodeName !== 'rdf:rdf' || !rdf.hasChildNodes()) {
      return;
    }

-    var children = rdf.childNodes, desc, entry, name, i, ii, length, iLength;
-    for (i = 0, length = children.length; i < length; i++) {
-      desc = children[i];
+    let children = rdf.childNodes;
+    for (let i = 0, ii = children.length; i < ii; i++) {
+      let desc = children[i];
      if (desc.nodeName.toLowerCase() !== 'rdf:description') {
        continue;
      }

-      for (ii = 0, iLength = desc.childNodes.length; ii < iLength; ii++) {
-        if (desc.childNodes[ii].nodeName.toLowerCase() !== '#text') {
-          entry = desc.childNodes[ii];
-          name = entry.nodeName.toLowerCase();
-          this.metadata[name] = entry.textContent.trim();
+      for (let j = 0, jj = desc.childNodes.length; j < jj; j++) {
+        if (desc.childNodes[j].nodeName.toLowerCase() !== '#text') {
+          let entry = desc.childNodes[j];
+          let name = entry.nodeName.toLowerCase();
+
+          this._metadata[name] = entry.textContent.trim();
        }
      }
    }
-  },
+  }

-  get: function Metadata_get(name) {
-    return this.metadata[name] || null;
-  },
+  get(name) {
+    return this._metadata[name] || null;
+  }

-  has: function Metadata_has(name) {
-    return typeof this.metadata[name] !== 'undefined';
-  },
-};
+  getAll() {
+    return this._metadata;
+  }
+
+  has(name) {
+    return typeof this._metadata[name] !== 'undefined';
+  }
+
+  get metadata() {
+    deprecated('`metadata` getter; use `getAll()` instead.');
+    return this.getAll();
+  }
+}

 export {
  Metadata,
--- a/test/unit/api_spec.js
+++ b/test/unit/api_spec.js
@ -790,9 +790,6 @@ describe('api', function() {
      });
    });
    it('gets metadata', function(done) {
-      if (isNodeJS()) {
-        pending('Document is not supported in Node.js.');
-      }
      var promise = doc.getMetadata();
      promise.then(function(metadata) {
        expect(metadata.info['Title']).toEqual('Basic API Test');
--- a/test/unit/clitests.json
+++ b/test/unit/clitests.json
@ -14,6 +14,7 @@
    "evaluator_spec.js",
    "fonts_spec.js",
    "function_spec.js",
+    "metadata_spec.js",
    "murmurhash3_spec.js",
    "node_stream_spec.js",
    "parser_spec.js",
--- a/test/unit/metadata_spec.js
+++ b/test/unit/metadata_spec.js
@ -16,15 +16,37 @@
 import { Metadata } from '../../src/display/metadata';

 describe('metadata', function() {
-  describe('incorrect_xmp', function() {
-    it('should fix the incorrect XMP data', function() {
-      var invalidXMP = '<x:xmpmeta xmlns:x=\'adobe:ns:meta/\'>' +
-        '<rdf:RDF xmlns:rdf=\'http://www.w3.org/1999/02/22-rdf-syntax-ns#\'>' +
-        '<rdf:Description xmlns:dc=\'http://purl.org/dc/elements/1.1/\'>' +
-        '<dc:title>\\376\\377\\000P\\000D\\000F\\000&</dc:title>' +
-        '</rdf:Description></rdf:RDF></x:xmpmeta>';
-      var meta = new Metadata(invalidXMP);
-      expect(meta.get('dc:title')).toEqual('PDF&');
-    });
+  it('should handle valid metadata', function() {
+    var validData = '<x:xmpmeta xmlns:x=\'adobe:ns:meta/\'>' +
+      '<rdf:RDF xmlns:rdf=\'http://www.w3.org/1999/02/22-rdf-syntax-ns#\'>' +
+      '<rdf:Description xmlns:dc=\'http://purl.org/dc/elements/1.1/\'>' +
+      '<dc:title><rdf:Alt><rdf:li xml:lang="x-default">Foo bar baz</rdf:li>' +
+      '</rdf:Alt></dc:title></rdf:Description></rdf:RDF></x:xmpmeta>';
+    var metadata = new Metadata(validData);
+
+    expect(metadata.has('dc:title')).toBeTruthy();
+    expect(metadata.has('dc:qux')).toBeFalsy();
+
+    expect(metadata.get('dc:title')).toEqual('Foo bar baz');
+    expect(metadata.get('dc:qux')).toEqual(null);
+
+    expect(metadata.getAll()).toEqual({ 'dc:title': 'Foo bar baz', });
+  });
+
+  it('should repair and handle invalid metadata', function() {
+    var invalidData = '<x:xmpmeta xmlns:x=\'adobe:ns:meta/\'>' +
+      '<rdf:RDF xmlns:rdf=\'http://www.w3.org/1999/02/22-rdf-syntax-ns#\'>' +
+      '<rdf:Description xmlns:dc=\'http://purl.org/dc/elements/1.1/\'>' +
+      '<dc:title>\\376\\377\\000P\\000D\\000F\\000&</dc:title>' +
+      '</rdf:Description></rdf:RDF></x:xmpmeta>';
+    var metadata = new Metadata(invalidData);
+
+    expect(metadata.has('dc:title')).toBeTruthy();
+    expect(metadata.has('dc:qux')).toBeFalsy();
+
+    expect(metadata.get('dc:title')).toEqual('PDF&');
+    expect(metadata.get('dc:qux')).toEqual(null);
+
+    expect(metadata.getAll()).toEqual({ 'dc:title': 'PDF&', });
  });
 });