8ec99b200c
With the new XML parser, see PR 9573, the referenced PDF file now causes `getMetadata` to fail when incomplete XML tags are encountered. This provides a simple, and hopefully generally useful, work-around that may also help prevent future bugs. (Without being able to reproduce nor even understand the other (non XML) errors mentioned in issue 8884, I'd say that this patch is enough to close that one as fixed.)
378 lines
9.8 KiB
JavaScript
378 lines
9.8 KiB
JavaScript
/* Copyright 2018 Mozilla Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
|
|
|
|
// The code for XMLParserBase was copied from
// https://github.com/mozilla/shumway/blob/16451d8836fa85f4b16eeda8b4bda2fa9e2b22b0/src/avm2/natives/xml.ts
|
|
|
|
/**
 * Numeric status codes handed to the parser's `onError` hook.
 * Zero means success; every failure is a distinct negative integer.
 */
const XMLParserErrorCode = {
  // No error has been reported.
  NoError: 0,
  // Not referenced by the parser code in this file.
  EndOfDocument: -1,
  // A `<![CDATA[` section without a closing `]]>`.
  UnterminatedCdat: -2,
  // A `<?...` processing instruction without a closing `?>`.
  UnterminatedXmlDeclaration: -3,
  // A `<!DOCTYPE` declaration without its closing `>` (or `]>`).
  UnterminatedDoctypeDeclaration: -4,
  // A `<!--` comment without a closing `-->`.
  UnterminatedComment: -5,
  // A start tag whose attributes could not be parsed, or an unknown `<!`
  // construct.
  MalformedElement: -6,
  // Not referenced by the parser code in this file.
  OutOfMemory: -7,
  // Not referenced by the parser code in this file.
  UnterminatedAttributeValue: -8,
  // A start or end tag without a closing `>`.
  UnterminatedElement: -9,
  // Not referenced by the parser code in this file.
  ElementNeverBegun: -10,
};
|
|
|
|
/**
 * Reports whether the character of `s` at position `index` is a space,
 * line feed, carriage return or tab.
 *
 * @param {string} s - The string to inspect.
 * @param {number} index - Position of the character to test.
 * @returns {boolean} `true` for one of the four characters above; `false`
 *   otherwise, including when `index` is outside the string.
 */
function isWhitespace(s, index) {
  switch (s[index]) {
    case ' ':
    case '\n':
    case '\r':
    case '\t':
      return true;
    default:
      return false;
  }
}
|
|
|
|
/**
 * Reports whether `s` consists solely of characters accepted by
 * `isWhitespace`.
 *
 * @param {string} s - The string to scan.
 * @returns {boolean} `false` as soon as a non-whitespace character is found;
 *   `true` otherwise (an empty string therefore yields `true`).
 */
function isWhitespaceString(s) {
  const length = s.length;
  let index = 0;
  while (index < length) {
    if (!isWhitespace(s, index)) {
      return false;
    }
    index += 1;
  }
  return true;
}
|
|
|
|
/**
 * Minimal event-driven XML tokenizer.
 *
 * `parseXml` walks the input string once and reports what it finds by calling
 * the `on*` hook methods defined at the bottom of the class. Apart from
 * `onResolveEntity` the hooks are no-ops here and are meant to be overridden
 * by subclasses. Malformed input is reported through `onError` with an
 * `XMLParserErrorCode` value, after which parsing stops.
 */
class XMLParserBase {
  /**
   * Expands `&...;` references in `s`.
   *
   * Decimal (`&#65;`) and hexadecimal (`&#x41;`) character references and the
   * names `lt`, `gt`, `amp` and `quot` are handled here; any other name is
   * delegated to `onResolveEntity`.
   *
   * @param {string} s - Raw text or attribute value.
   * @returns {string} `s` with the references replaced.
   */
  _resolveEntities(s) {
    // Code points above U+FFFF need `fromCodePoint`; `fromCharCode` would
    // silently truncate them to 16 bits. Everything else (including NaN from
    // a malformed reference, for which `fromCodePoint` would throw) keeps the
    // `fromCharCode` behaviour.
    const fromCode = (code) => {
      if (code > 0xFFFF && code <= 0x10FFFF) {
        return String.fromCodePoint(code);
      }
      return String.fromCharCode(code);
    };

    // The replacer must be an arrow function: with a plain `function` the
    // value of `this` is undefined inside the callback (class bodies are
    // strict code), so the `onResolveEntity` call below would throw.
    return s.replace(/&([^;]+);/g, (all, entity) => {
      if (entity.substring(0, 2) === '#x') {
        return fromCode(parseInt(entity.substring(2), 16));
      } else if (entity.substring(0, 1) === '#') {
        return fromCode(parseInt(entity.substring(1), 10));
      }
      switch (entity) {
        case 'lt':
          return '<';
        case 'gt':
          return '>';
        case 'amp':
          return '&';
        case 'quot':
          return '"';
      }
      return this.onResolveEntity(entity);
    });
  }

  /**
   * Parses the inside of a start tag (element name followed by attributes),
   * beginning at `start`, which is the index just after the `<`.
   *
   * @param {string} s - The complete XML source.
   * @param {number} start - Index of the first character of the tag name.
   * @returns {{name: string, attributes: Array<{name: string, value: string}>,
   *   parsed: number} | null} The tag name, its attributes (values with
   *   entities resolved) and the number of characters consumed; `null` when
   *   an attribute lacks `=`, a quoted value, or the closing quote.
   */
  _parseContent(s, start) {
    const attributes = [];
    let pos = start;

    function skipWs() {
      while (pos < s.length && isWhitespace(s, pos)) {
        ++pos;
      }
    }

    // The tag name runs up to the first whitespace, `>` or `/`.
    while (pos < s.length && !isWhitespace(s, pos) &&
           s[pos] !== '>' && s[pos] !== '/') {
      ++pos;
    }
    const name = s.substring(start, pos);
    skipWs();

    // One `name = "value"` pair per iteration, until the tag terminator.
    while (pos < s.length && s[pos] !== '>' &&
           s[pos] !== '/' && s[pos] !== '?') {
      skipWs();
      let attrName = '';
      while (pos < s.length && !isWhitespace(s, pos) && s[pos] !== '=') {
        attrName += s[pos];
        ++pos;
      }
      skipWs();
      if (s[pos] !== '=') {
        return null;
      }
      ++pos;
      skipWs();
      // The value must be quoted; the opening quote decides the closing one.
      const attrEndChar = s[pos];
      if (attrEndChar !== '"' && attrEndChar !== '\'') {
        return null;
      }
      const attrEndIndex = s.indexOf(attrEndChar, ++pos);
      if (attrEndIndex < 0) {
        return null;
      }
      const attrValue = s.substring(pos, attrEndIndex);
      attributes.push({
        name: attrName,
        value: this._resolveEntities(attrValue),
      });
      pos = attrEndIndex + 1;
      skipWs();
    }
    return {
      name,
      attributes,
      parsed: pos - start,
    };
  }

  /**
   * Parses a processing instruction, beginning at `start`, which is the index
   * just after the `<?`.
   *
   * @param {string} s - The complete XML source.
   * @param {number} start - Index of the first character of the target name.
   * @returns {{name: string, value: string, parsed: number}} The target name,
   *   the raw text between the name and the `?>` terminator (or the end of
   *   the input when no terminator exists), and the characters consumed.
   */
  _parseProcessingInstruction(s, start) {
    let pos = start;

    function skipWs() {
      while (pos < s.length && isWhitespace(s, pos)) {
        ++pos;
      }
    }

    while (pos < s.length && !isWhitespace(s, pos) &&
           s[pos] !== '>' && s[pos] !== '/') {
      ++pos;
    }
    const name = s.substring(start, pos);
    skipWs();

    const attrStart = pos;
    while (pos < s.length && (s[pos] !== '?' || s[pos + 1] !== '>')) {
      ++pos;
    }
    const value = s.substring(attrStart, pos);
    return {
      name,
      value,
      parsed: pos - start,
    };
  }

  /**
   * Tokenizes `s` and invokes the matching `on*` hook for every construct
   * found. On the first malformed construct `onError` is called and the
   * method returns without processing the remaining input.
   *
   * @param {string} s - The XML source.
   */
  parseXml(s) {
    let i = 0;
    while (i < s.length) {
      const ch = s[i];
      let j = i;
      if (ch === '<') {
        ++j;
        const ch2 = s[j];
        let q;
        switch (ch2) {
          case '/': {
            // End tag: `</name>`.
            ++j;
            q = s.indexOf('>', j);
            if (q < 0) {
              this.onError(XMLParserErrorCode.UnterminatedElement);
              return;
            }
            this.onEndElement(s.substring(j, q));
            j = q + 1;
            break;
          }
          case '?': {
            // Processing instruction: `<?name value?>`.
            ++j;
            const pi = this._parseProcessingInstruction(s, j);
            if (s.substring(j + pi.parsed, j + pi.parsed + 2) !== '?>') {
              this.onError(XMLParserErrorCode.UnterminatedXmlDeclaration);
              return;
            }
            this.onPi(pi.name, pi.value);
            j += pi.parsed + 2;
            break;
          }
          case '!':
            if (s.substring(j + 1, j + 3) === '--') {
              // Comment: `<!-- ... -->`.
              q = s.indexOf('-->', j + 3);
              if (q < 0) {
                this.onError(XMLParserErrorCode.UnterminatedComment);
                return;
              }
              this.onComment(s.substring(j + 3, q));
              j = q + 3;
            } else if (s.substring(j + 1, j + 8) === '[CDATA[') {
              // CDATA section: `<![CDATA[ ... ]]>`.
              q = s.indexOf(']]>', j + 8);
              if (q < 0) {
                this.onError(XMLParserErrorCode.UnterminatedCdat);
                return;
              }
              this.onCdata(s.substring(j + 8, q));
              j = q + 3;
            } else if (s.substring(j + 1, j + 8) === 'DOCTYPE') {
              const q2 = s.indexOf('[', j + 8);
              let complexDoctype = false;
              q = s.indexOf('>', j + 8);
              if (q < 0) {
                this.onError(XMLParserErrorCode.UnterminatedDoctypeDeclaration);
                return;
              }
              // A `[` before the first `>` means the declaration carries a
              // bracketed section and therefore ends at `]>` instead.
              if (q2 > 0 && q > q2) {
                q = s.indexOf(']>', j + 8);
                if (q < 0) {
                  this.onError(
                    XMLParserErrorCode.UnterminatedDoctypeDeclaration);
                  return;
                }
                complexDoctype = true;
              }
              const doctypeContent =
                s.substring(j + 8, q + (complexDoctype ? 1 : 0));
              this.onDoctype(doctypeContent);
              j = q + (complexDoctype ? 2 : 1);
            } else {
              this.onError(XMLParserErrorCode.MalformedElement);
              return;
            }
            break;
          default: {
            // Start tag, either `<name ...>` or self-closing `<name .../>`.
            const content = this._parseContent(s, j);
            if (content === null) {
              this.onError(XMLParserErrorCode.MalformedElement);
              return;
            }
            const tagEnd = j + content.parsed;
            let isClosed = false;
            if (s.substring(tagEnd, tagEnd + 2) === '/>') {
              isClosed = true;
            } else if (s.substring(tagEnd, tagEnd + 1) !== '>') {
              this.onError(XMLParserErrorCode.UnterminatedElement);
              return;
            }
            this.onBeginElement(content.name, content.attributes, isClosed);
            j += content.parsed + (isClosed ? 2 : 1);
            break;
          }
        }
      } else {
        // Character data up to the next `<` (or the end of the input).
        while (j < s.length && s[j] !== '<') {
          j++;
        }
        const text = s.substring(i, j);
        this.onText(this._resolveEntities(text));
      }
      i = j;
    }
  }

  /**
   * Hook for entity names not handled by `_resolveEntities` itself.
   * The default leaves the reference untouched.
   *
   * @param {string} name - Entity name without `&` and `;`.
   * @returns {string} Replacement text.
   */
  onResolveEntity(name) {
    return `&${name};`;
  }

  /** Hook: processing instruction with target `name` and raw `value`. */
  onPi(name, value) { }

  /** Hook: comment; `text` excludes the `<!--` and `-->` delimiters. */
  onComment(text) { }

  /** Hook: CDATA section; `text` excludes the delimiters. */
  onCdata(text) { }

  /** Hook: DOCTYPE declaration; receives the text after `<!DOCTYPE`. */
  onDoctype(doctypeContent) { }

  /** Hook: character data with entities already resolved. */
  onText(text) { }

  /**
   * Hook: start tag. `attributes` is an array of `{ name, value }` objects
   * and `isEmpty` is `true` for a self-closing tag.
   */
  onBeginElement(name, attributes, isEmpty) { }

  /** Hook: end tag; `name` is the text between `</` and `>`. */
  onEndElement(name) { }

  /** Hook: parse failure; `code` is an `XMLParserErrorCode` value. */
  onError(code) { }
}
|
|
|
|
/**
 * Lightweight stand-in for a DOM node.
 *
 * Only `nodeName` and `nodeValue` are set by the constructor. `childNodes` is
 * not created here; code that builds element nodes assigns it afterwards, so
 * every accessor below tolerates its absence.
 */
class SimpleDOMNode {
  /**
   * @param {string} nodeName - Node name.
   * @param {string} [nodeValue] - Node value; may be omitted.
   */
  constructor(nodeName, nodeValue) {
    this.nodeName = nodeName;
    this.nodeValue = nodeValue;

    // Defined through `defineProperty` so that it is writable but
    // non-enumerable.
    Object.defineProperty(this, 'parentNode', { value: null, writable: true, });
  }

  /**
   * @returns {SimpleDOMNode|undefined} The first child, or `undefined` when
   *   the node has no `childNodes` list or the list is empty.
   */
  get firstChild() {
    return this.childNodes && this.childNodes[0];
  }

  /**
   * @returns {SimpleDOMNode|undefined} The node following this one in its
   *   parent's `childNodes`, or `undefined` when there is no parent, the
   *   node is not found in the parent's list, or it is the last child.
   */
  get nextSibling() {
    const siblings = this.parentNode && this.parentNode.childNodes;
    if (!siblings) {
      return undefined;
    }
    const index = siblings.indexOf(this);
    // Without this check a detached node would report its parent's first
    // child (`siblings[-1 + 1]`) as its sibling.
    if (index === -1) {
      return undefined;
    }
    return siblings[index + 1];
  }

  /**
   * @returns {string} For nodes without `childNodes`, the node value (or an
   *   empty string); otherwise the concatenated `textContent` of all
   *   children, in order.
   */
  get textContent() {
    if (!this.childNodes) {
      return this.nodeValue || '';
    }
    return this.childNodes.map((child) => child.textContent).join('');
  }

  /**
   * @returns {boolean} `true` when the node has at least one child.
   */
  hasChildNodes() {
    return !!this.childNodes && this.childNodes.length > 0;
  }
}
|
|
|
|
/**
 * XML parser that turns the events emitted by `XMLParserBase.parseXml` into
 * a tree of `SimpleDOMNode` objects.
 *
 * While parsing, `_currentFragment` is the child list that new nodes are
 * appended to and `_stack` holds the child lists of the enclosing, still
 * open, elements. Attributes passed to `onBeginElement` are not stored on
 * the created nodes.
 */
class SimpleXMLParser extends XMLParserBase {
  constructor() {
    super();
    this._currentFragment = null;
    this._stack = null;
    this._errorCode = XMLParserErrorCode.NoError;
  }

  /**
   * Parses `data` and returns a minimal document object.
   *
   * @param {string} data - The XML source.
   * @returns {{documentElement: SimpleDOMNode}|undefined} An object holding
   *   the first top-level node, or `undefined` when a parse error was
   *   reported or the input produced no nodes.
   */
  parseFromString(data) {
    this._currentFragment = [];
    this._stack = [];
    this._errorCode = XMLParserErrorCode.NoError;

    this.parseXml(data);

    if (this._errorCode !== XMLParserErrorCode.NoError) {
      return undefined; // return undefined on error
    }

    // Input that ends while elements are still open leaves
    // `_currentFragment` pointing at the innermost open element's children.
    // Close those elements implicitly so that the top-level list (and the
    // `parentNode` links) are restored before the root is picked.
    while (this._stack.length > 0) {
      this._closeCurrentElement();
    }

    // We should only have one root.
    const [documentElement] = this._currentFragment;
    if (!documentElement) {
      return undefined; // Return undefined if no root was found.
    }
    return { documentElement, };
  }

  /**
   * Adds `&apos;` to the entities resolved by the base class.
   *
   * @param {string} name - Entity name without `&` and `;`.
   * @returns {string} Replacement text.
   */
  onResolveEntity(name) {
    switch (name) {
      case 'apos':
        return '\'';
    }
    return super.onResolveEntity(name);
  }

  /** Appends a `#text` node, unless `text` is whitespace-only. */
  onText(text) {
    if (isWhitespaceString(text)) {
      return;
    }
    const node = new SimpleDOMNode('#text', text);
    this._currentFragment.push(node);
  }

  /** Appends the CDATA content as a `#text` node, whitespace included. */
  onCdata(text) {
    const node = new SimpleDOMNode('#text', text);
    this._currentFragment.push(node);
  }

  /**
   * Appends a new element node and, unless the tag is self-closing, makes
   * its (initially empty) child list the current fragment.
   */
  onBeginElement(name, attributes, isEmpty) {
    const node = new SimpleDOMNode(name);
    node.childNodes = [];
    this._currentFragment.push(node);
    if (isEmpty) {
      return;
    }
    this._stack.push(this._currentFragment);
    this._currentFragment = node.childNodes;
  }

  /**
   * Closes the innermost open element. The tag name is not compared against
   * the element being closed.
   *
   * A closing tag with no open element is reported as `ElementNeverBegun`
   * instead of popping an empty stack, which previously replaced
   * `_currentFragment` with `undefined` and threw a TypeError.
   */
  onEndElement(name) {
    if (this._stack.length === 0) {
      this.onError(XMLParserErrorCode.ElementNeverBegun);
      return;
    }
    this._closeCurrentElement();
  }

  /**
   * Restores the enclosing fragment and sets `parentNode` on the children of
   * the element that was just closed. Requires a non-empty `_stack`.
   */
  _closeCurrentElement() {
    this._currentFragment = this._stack.pop();
    // `onBeginElement` pushed the element onto this fragment right before
    // descending into it, so it is the fragment's last entry.
    const lastElement = this._currentFragment[this._currentFragment.length - 1];
    for (let i = 0, ii = lastElement.childNodes.length; i < ii; i++) {
      lastElement.childNodes[i].parentNode = lastElement;
    }
  }

  /** Records the error code so that `parseFromString` can report failure. */
  onError(code) {
    this._errorCode = code;
  }
}
|
|
|
|
export {
|
|
SimpleXMLParser,
|
|
};
|