From 655c8d34d042966f080bcd7b8e6b9faca9bc5664 Mon Sep 17 00:00:00 2001 From: Yury Delendik Date: Fri, 16 Mar 2018 16:48:08 -0500 Subject: [PATCH] New XML parser --- src/display/dom_utils.js | 127 ------------- src/display/metadata.js | 14 +- src/display/xml_parser.js | 374 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 382 insertions(+), 133 deletions(-) create mode 100644 src/display/xml_parser.js diff --git a/src/display/dom_utils.js b/src/display/dom_utils.js index 565c9836a..adf0c8e71 100644 --- a/src/display/dom_utils.js +++ b/src/display/dom_utils.js @@ -135,132 +135,6 @@ class DOMSVGFactory { } } -class SimpleDOMNode { - constructor(nodeName, nodeValue) { - this.nodeName = nodeName; - this.nodeValue = nodeValue; - - Object.defineProperty(this, 'parentNode', { value: null, writable: true, }); - } - - get firstChild() { - return this.childNodes[0]; - } - - get nextSibling() { - let index = this.parentNode.childNodes.indexOf(this); - return this.parentNode.childNodes[index + 1]; - } - - get textContent() { - if (!this.childNodes) { - return this.nodeValue || ''; - } - return this.childNodes.map(function(child) { - return child.textContent; - }).join(''); - } - - hasChildNodes() { - return this.childNodes && this.childNodes.length > 0; - } -} - -class SimpleXMLParser { - parseFromString(data) { - let nodes = []; - - // Remove all comments and processing instructions. - data = data.replace(/<\?[\s\S]*?\?>|/g, '').trim(); - data = data.replace(/\[]+(\[[^\]]+)?[^>]+>/g, '').trim(); - - // Extract all text nodes and replace them with a numeric index in - // the nodes. - data = data.replace(/>([^<][\s\S]*?) { - let length = nodes.length; - let node = new SimpleDOMNode('#text', this._decodeXML(text)); - nodes.push(node); - if (node.textContent.trim().length === 0) { - return '><'; // Ignore whitespace. - } - return '>' + length + ',<'; - }); - - // Extract all CDATA nodes. 
- data = data.replace(//g, - function(all, text) { - let length = nodes.length; - let node = new SimpleDOMNode('#text', text); - nodes.push(node); - return length + ','; - }); - - // Until nodes without '<' and '>' content are present, replace them - // with a numeric index in the nodes. - let regex = - /<([\w\:]+)((?:[\s\w:=]|'[^']*'|"[^"]*")*)(?:\/>|>([\d,]*)<\/[^>]+>)/g; - let lastLength; - do { - lastLength = nodes.length; - data = data.replace(regex, function(all, name, attrs, data) { - let length = nodes.length; - let node = new SimpleDOMNode(name); - let children = []; - if (data) { - data = data.split(','); - data.pop(); - data.forEach(function(child) { - let childNode = nodes[+child]; - childNode.parentNode = node; - children.push(childNode); - }); - } - - node.childNodes = children; - nodes.push(node); - return length + ','; - }); - } while (lastLength < nodes.length); - - // We should only have one root index left, which will be last in the nodes. - return { - documentElement: nodes.pop(), - }; - } - - _decodeXML(text) { - if (!text.includes('&')) { - return text; - } - - return text.replace(/&(#(x[0-9a-f]+|\d+)|\w+);/gi, - function(all, entityName, number) { - if (number) { - if (number[0] === 'x') { - number = parseInt(number.substring(1), 16); - } else { - number = +number; - } - return String.fromCharCode(number); - } - - switch (entityName) { - case 'amp': - return '&'; - case 'lt': - return '<'; - case 'gt': - return '>'; - case 'quot': - return '\"'; - case 'apos': - return '\''; - } - return '&' + entityName + ';'; - }); - } -} - var RenderingCancelledException = (function RenderingCancelledException() { function RenderingCancelledException(msg, type) { this.message = msg; @@ -411,7 +285,6 @@ export { DOMCanvasFactory, DOMCMapReaderFactory, DOMSVGFactory, - SimpleXMLParser, StatTimer, DummyStatTimer, }; diff --git a/src/display/metadata.js b/src/display/metadata.js index af58fb5c3..5c7922bf4 100644 --- a/src/display/metadata.js +++ 
b/src/display/metadata.js @@ -14,7 +14,7 @@ */ import { assert } from '../shared/util'; -import { SimpleXMLParser } from './dom_utils'; +import { SimpleXMLParser } from './xml_parser'; class Metadata { constructor(data) { @@ -23,13 +23,15 @@ class Metadata { // Ghostscript may produce invalid metadata, so try to repair that first. data = this._repair(data); - // Convert the string to a DOM `Document`. + // Convert the string to an XML document. let parser = new SimpleXMLParser(); - data = parser.parseFromString(data); + const xmlDocument = parser.parseFromString(data); this._metadata = Object.create(null); - this._parse(data); + if (xmlDocument) { + this._parse(xmlDocument); + } } _repair(data) { @@ -68,8 +70,8 @@ class Metadata { }); } - _parse(domDocument) { - let rdf = domDocument.documentElement; + _parse(xmlDocument) { + let rdf = xmlDocument.documentElement; if (rdf.nodeName.toLowerCase() !== 'rdf:rdf') { // Wrapped in rdf = rdf.firstChild; diff --git a/src/display/xml_parser.js b/src/display/xml_parser.js new file mode 100644 index 000000000..207f928a6 --- /dev/null +++ b/src/display/xml_parser.js @@ -0,0 +1,374 @@ +/* Copyright 2018 Mozilla Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
/*
 * End of the Apache License, Version 2.0 header of src/display/xml_parser.js.
 */

// The code for XMLParserBase copied from
// https://github.com/mozilla/shumway/blob/16451d8836fa85f4b16eeda8b4bda2fa9e2b22b0/src/avm2/natives/xml.ts

/**
 * Error codes handed to `XMLParserBase#onError`. `NoError` is the only
 * non-negative value.
 */
const XMLParserErrorCode = {
  NoError: 0,
  EndOfDocument: -1,
  UnterminatedCdat: -2,
  UnterminatedXmlDeclaration: -3,
  UnterminatedDoctypeDeclaration: -4,
  UnterminatedComment: -5,
  MalformedElement: -6,
  OutOfMemory: -7,
  UnterminatedAttributeValue: -8,
  UnterminatedElement: -9,
  ElementNeverBegun: -10,
};

// Largest value accepted by `String.fromCodePoint`.
const MAX_CODE_POINT = 0x10FFFF;

/**
 * @param {string} s
 * @param {number} index
 * @returns {boolean} True if `s[index]` is a space, LF, CR or tab. An index
 *   past the end of the string yields false.
 */
function isWhitespace(s, index) {
  const ch = s[index];
  return ch === ' ' || ch === '\n' || ch === '\r' || ch === '\t';
}

/**
 * @param {string} s
 * @returns {boolean} True if every character of `s` is whitespace (this
 *   includes the empty string).
 */
function isWhitespaceString(s) {
  for (let i = 0, ii = s.length; i < ii; i++) {
    if (!isWhitespace(s, i)) {
      return false;
    }
  }
  return true;
}

/**
 * @param {string} s
 * @param {number} pos
 * @returns {number} Index of the first non-whitespace character at or after
 *   `pos`, or `s.length` if there is none.
 */
function skipWhitespace(s, pos) {
  while (pos < s.length && isWhitespace(s, pos)) {
    ++pos;
  }
  return pos;
}

/**
 * @param {string} s
 * @param {number} pos - Index of the first character of a tag/PI name.
 * @returns {number} Index just past the name, i.e. of the first whitespace,
 *   '>' or '/' character (or `s.length`).
 */
function findNameEnd(s, pos) {
  while (pos < s.length && !isWhitespace(s, pos) &&
         s[pos] !== '>' && s[pos] !== '/') {
    ++pos;
  }
  return pos;
}

/**
 * Decodes the numeric part of a character reference.
 *
 * @param {string} reference - The complete reference text, e.g. '&#x41;'.
 * @param {string} digits - The digits following '#' or '#x'.
 * @param {number} radix - 10 or 16.
 * @returns {string} The referenced character. If the digits do not denote a
 *   code point in the range 0..0x10FFFF, `reference` is returned unchanged.
 */
function decodeCharacterReference(reference, digits, radix) {
  const codePoint = parseInt(digits, radix);
  // The negated form also rejects NaN, for which `String.fromCodePoint`
  // would throw a RangeError.
  if (!(codePoint >= 0 && codePoint <= MAX_CODE_POINT)) {
    return reference;
  }
  // `fromCodePoint` (rather than `fromCharCode`) keeps characters above
  // U+FFFF intact instead of truncating them to 16 bits.
  return String.fromCodePoint(codePoint);
}

/**
 * Event-based XML tokenizer. `parseXml` walks the input once and invokes the
 * `on*` hooks, which do nothing by default; subclasses override the ones they
 * need.
 */
class XMLParserBase {
  /**
   * Replaces `&...;` references in `s`. Numeric references and the names
   * lt, gt, amp and quot are handled here; every other name is delegated to
   * `onResolveEntity`.
   *
   * @param {string} s
   * @returns {string}
   */
  _resolveEntities(s) {
    // An arrow function is required here: the callback calls
    // `this.onResolveEntity`, and a plain `function` callback would run with
    // `this === undefined`.
    return s.replace(/&([^;]+);/g, (all, entity) => {
      if (entity.substring(0, 2) === '#x') {
        return decodeCharacterReference(all, entity.substring(2), 16);
      } else if (entity.substring(0, 1) === '#') {
        return decodeCharacterReference(all, entity.substring(1), 10);
      }
      switch (entity) {
        case 'lt':
          return '<';
        case 'gt':
          return '>';
        case 'amp':
          return '&';
        case 'quot':
          return '"';
      }
      return this.onResolveEntity(entity);
    });
  }

  /**
   * Parses an element name and its attributes.
   *
   * @param {string} s - The whole document.
   * @param {number} start - Index of the first character after '<'.
   * @returns {Object|null} `{ name, attributes, parsed }`, where `attributes`
   *   is an array of `{ name, value }` (values have entities resolved) and
   *   `parsed` is the number of characters consumed; `null` if an attribute
   *   is malformed (missing '=', missing or unterminated quote).
   */
  _parseContent(s, start) {
    const attributes = [];
    let pos = findNameEnd(s, start);
    const name = s.substring(start, pos);
    pos = skipWhitespace(s, pos);

    while (pos < s.length && s[pos] !== '>' &&
           s[pos] !== '/' && s[pos] !== '?') {
      const attrNameStart = pos;
      while (pos < s.length && !isWhitespace(s, pos) && s[pos] !== '=') {
        ++pos;
      }
      const attrName = s.substring(attrNameStart, pos);
      pos = skipWhitespace(s, pos);
      if (s[pos] !== '=') {
        return null;
      }
      pos = skipWhitespace(s, pos + 1);
      const attrEndChar = s[pos];
      if (attrEndChar !== '"' && attrEndChar !== '\'') {
        return null;
      }
      const attrValueStart = pos + 1;
      const attrEndIndex = s.indexOf(attrEndChar, attrValueStart);
      if (attrEndIndex < 0) {
        return null;
      }
      const attrValue = s.substring(attrValueStart, attrEndIndex);
      attributes.push({
        name: attrName,
        value: this._resolveEntities(attrValue),
      });
      pos = skipWhitespace(s, attrEndIndex + 1);
    }
    return {
      name,
      attributes,
      parsed: pos - start,
    };
  }

  /**
   * Parses the target name and raw content of a processing instruction.
   *
   * @param {string} s - The whole document.
   * @param {number} start - Index of the first character after '<?'.
   * @returns {Object} `{ name, value, parsed }`; `parsed` is the number of
   *   characters before the closing '?>' (or before the end of input when
   *   the instruction is unterminated).
   */
  _parseProcessingInstruction(s, start) {
    let pos = findNameEnd(s, start);
    const name = s.substring(start, pos);
    pos = skipWhitespace(s, pos);
    const attrStart = pos;
    while (pos < s.length && (s[pos] !== '?' || s[pos + 1] !== '>')) {
      ++pos;
    }
    const value = s.substring(attrStart, pos);
    return {
      name,
      value,
      parsed: pos - start,
    };
  }

  /**
   * Tokenizes `s`, invoking the `on*` hooks in document order. On the first
   * syntax error `onError` is called with an `XMLParserErrorCode` value and
   * parsing stops.
   *
   * @param {string} s
   */
  parseXml(s) {
    let i = 0;
    while (i < s.length) {
      const ch = s[i];
      let j = i;
      if (ch === '<') {
        ++j;
        const ch2 = s[j];
        let q;
        switch (ch2) {
          case '/': {
            ++j;
            q = s.indexOf('>', j);
            if (q < 0) {
              this.onError(XMLParserErrorCode.UnterminatedElement);
              return;
            }
            this.onEndElement(s.substring(j, q));
            j = q + 1;
            break;
          }
          case '?': {
            ++j;
            const pi = this._parseProcessingInstruction(s, j);
            if (s.substring(j + pi.parsed, j + pi.parsed + 2) !== '?>') {
              this.onError(XMLParserErrorCode.UnterminatedXmlDeclaration);
              return;
            }
            this.onPi(pi.name, pi.value);
            j += pi.parsed + 2;
            break;
          }
          case '!': {
            if (s.substring(j + 1, j + 3) === '--') {
              q = s.indexOf('-->', j + 3);
              if (q < 0) {
                this.onError(XMLParserErrorCode.UnterminatedComment);
                return;
              }
              this.onComment(s.substring(j + 3, q));
              j = q + 3;
            } else if (s.substring(j + 1, j + 8) === '[CDATA[') {
              q = s.indexOf(']]>', j + 8);
              if (q < 0) {
                this.onError(XMLParserErrorCode.UnterminatedCdat);
                return;
              }
              this.onCdata(s.substring(j + 8, q));
              j = q + 3;
            } else if (s.substring(j + 1, j + 8) === 'DOCTYPE') {
              const q2 = s.indexOf('[', j + 8);
              let complexDoctype = false;
              q = s.indexOf('>', j + 8);
              if (q < 0) {
                this.onError(
                  XMLParserErrorCode.UnterminatedDoctypeDeclaration);
                return;
              }
              // A '[' before the first '>' means there is an internal
              // subset, so the declaration ends at ']>' instead.
              if (q2 > 0 && q > q2) {
                q = s.indexOf(']>', j + 8);
                if (q < 0) {
                  this.onError(
                    XMLParserErrorCode.UnterminatedDoctypeDeclaration);
                  return;
                }
                complexDoctype = true;
              }
              const doctypeContent =
                s.substring(j + 8, q + (complexDoctype ? 1 : 0));
              this.onDoctype(doctypeContent);
              j = q + (complexDoctype ? 2 : 1);
            } else {
              this.onError(XMLParserErrorCode.MalformedElement);
              return;
            }
            break;
          }
          default: {
            const content = this._parseContent(s, j);
            if (content === null) {
              this.onError(XMLParserErrorCode.MalformedElement);
              return;
            }
            const tagEnd = j + content.parsed;
            let isClosed = false;
            if (s.substring(tagEnd, tagEnd + 2) === '/>') {
              isClosed = true;
            } else if (s.substring(tagEnd, tagEnd + 1) !== '>') {
              this.onError(XMLParserErrorCode.UnterminatedElement);
              return;
            }
            this.onBeginElement(content.name, content.attributes, isClosed);
            j = tagEnd + (isClosed ? 2 : 1);
            break;
          }
        }
      } else {
        while (j < s.length && s[j] !== '<') {
          j++;
        }
        const text = s.substring(i, j);
        this.onText(this._resolveEntities(text));
      }
      i = j;
    }
  }

  /**
   * Called for entity names that `_resolveEntities` does not know.
   *
   * @param {string} name - Entity name without '&' and ';'.
   * @returns {string} Replacement text; by default the unchanged reference.
   */
  onResolveEntity(name) {
    return `&${name};`;
  }

  onPi(name, value) { }

  onComment(text) { }

  onCdata(text) { }

  onDoctype(doctypeContent) { }

  onText(text) { }

  onBeginElement(name, attributes, isEmpty) { }

  onEndElement(name) { }

  onError(code) { }
}

/**
 * Minimal DOM-like node. Element nodes get a `childNodes` array assigned by
 * the parser; text nodes ('#text') carry their content in `nodeValue` and
 * have no `childNodes`.
 */
class SimpleDOMNode {
  constructor(nodeName, nodeValue) {
    this.nodeName = nodeName;
    this.nodeValue = nodeValue;

    // Defined without `enumerable`, so the back-reference is skipped by
    // property enumeration.
    Object.defineProperty(this, 'parentNode', { value: null, writable: true, });
  }

  /**
   * @returns {SimpleDOMNode|undefined} The first child, or undefined for
   *   text nodes and childless elements.
   */
  get firstChild() {
    return this.childNodes ? this.childNodes[0] : undefined;
  }

  /**
   * @returns {SimpleDOMNode|undefined} The node following this one in its
   *   parent, or undefined if there is none (including when this node has
   *   no parent).
   */
  get nextSibling() {
    const parent = this.parentNode;
    if (!parent || !parent.childNodes) {
      return undefined;
    }
    const index = parent.childNodes.indexOf(this);
    if (index < 0) {
      return undefined;
    }
    return parent.childNodes[index + 1];
  }

  /**
   * @returns {string} `nodeValue` for text nodes; otherwise the concatenated
   *   text content of all descendants.
   */
  get textContent() {
    if (!this.childNodes) {
      return this.nodeValue || '';
    }
    return this.childNodes.map(function(child) {
      return child.textContent;
    }).join('');
  }

  hasChildNodes() {
    return this.childNodes && this.childNodes.length > 0;
  }
}

/**
 * Builds a tree of `SimpleDOMNode`s from the events of `XMLParserBase`.
 * Attributes, comments, processing instructions and whitespace-only text
 * are not represented in the tree.
 */
class SimpleXMLParser extends XMLParserBase {
  constructor() {
    super();
    this._currentFragment = null;
    this._stack = null;
    this._errorCode = XMLParserErrorCode.NoError;
  }

  /**
   * @param {string} data - XML source.
   * @returns {Object|undefined} `{ documentElement }`, or undefined if the
   *   input is malformed, leaves elements unclosed, or contains no root.
   */
  parseFromString(data) {
    this._currentFragment = [];
    this._stack = [];
    this._errorCode = XMLParserErrorCode.NoError;

    this.parseXml(data);

    if (this._errorCode === XMLParserErrorCode.NoError &&
        this._stack.length > 0) {
      // The input ended inside an open element; `_currentFragment` is then
      // a nested child list rather than the top-level one.
      this.onError(XMLParserErrorCode.UnterminatedElement);
    }
    if (this._errorCode !== XMLParserErrorCode.NoError) {
      return undefined; // return undefined on error
    }

    // We should only have one root.
    const [documentElement] = this._currentFragment;
    if (!documentElement) {
      return undefined; // no root node was found
    }
    return { documentElement, };
  }

  onResolveEntity(name) {
    switch (name) {
      case 'apos':
        return '\'';
    }
    return super.onResolveEntity(name);
  }

  onText(text) {
    if (isWhitespaceString(text)) {
      return;
    }
    const node = new SimpleDOMNode('#text', text);
    this._currentFragment.push(node);
  }

  onCdata(text) {
    const node = new SimpleDOMNode('#text', text);
    this._currentFragment.push(node);
  }

  onBeginElement(name, attributes, isEmpty) {
    const node = new SimpleDOMNode(name);
    node.childNodes = [];
    this._currentFragment.push(node);
    if (isEmpty) {
      return;
    }
    // Descend: subsequent nodes are collected as children of `node` until
    // the matching end tag restores the saved fragment.
    this._stack.push(this._currentFragment);
    this._currentFragment = node.childNodes;
  }

  onEndElement(name) {
    if (this._stack.length === 0) {
      // Closing tag without an open element.
      this.onError(XMLParserErrorCode.ElementNeverBegun);
      return;
    }
    this._currentFragment = this._stack.pop();
    // The element being closed is the last node pushed to the restored
    // fragment; link its children back to it.
    const lastElement = this._currentFragment[this._currentFragment.length - 1];
    for (let i = 0, ii = lastElement.childNodes.length; i < ii; i++) {
      lastElement.childNodes[i].parentNode = lastElement;
    }
  }

  onError(code) {
    this._errorCode = code;
  }
}

export {
  SimpleXMLParser,
};