Allow getOperatorList/getTextContent to skip errors when parsing broken XObjects (issue 8702, issue 8704)

This patch makes use of the existing `ignoreErrors` property in `src/core/evaluator.js`, see PRs 8240 and 8441, thus allowing us to attempt to recovery as much as possible of a page even when it contains broken XObjects.

Fixes 8702.
Fixes 8704.
This commit is contained in:
Jonas Jenwald 2017-09-17 13:35:18 +02:00
parent b3f8411264
commit b1472cddbb
5 changed files with 138 additions and 86 deletions

View File

@ -948,52 +948,65 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
case OPS.paintXObject:
// eagerly compile XForm objects
var name = args[0].name;
if (!name) {
warn('XObject must be referred to by name.');
continue;
}
if (imageCache[name] !== undefined) {
if (name && imageCache[name] !== undefined) {
operatorList.addOp(imageCache[name].fn, imageCache[name].args);
args = null;
continue;
}
var xobj = xobjs.get(name);
if (xobj) {
next(new Promise(function(resolveXObject, rejectXObject) {
if (!name) {
throw new FormatError('XObject must be referred to by name.');
}
let xobj = xobjs.get(name);
if (!xobj) {
operatorList.addOp(fn, args);
resolveXObject();
return;
}
if (!isStream(xobj)) {
throw new FormatError('XObject should be a stream');
}
var type = xobj.dict.get('Subtype');
let type = xobj.dict.get('Subtype');
if (!isName(type)) {
throw new FormatError('XObject should have a Name subtype');
}
if (type.name === 'Form') {
stateManager.save();
next(self.buildFormXObject(resources, xobj, null,
operatorList, task,
stateManager.state.clone()).
then(function () {
self.buildFormXObject(resources, xobj, null, operatorList,
task, stateManager.state.clone()).
then(function() {
stateManager.restore();
}));
resolveXObject();
}, rejectXObject);
return;
} else if (type.name === 'Image') {
self.buildPaintImageXObject(resources, xobj, false,
operatorList, name, imageCache);
args = null;
continue;
operatorList, name, imageCache);
} else if (type.name === 'PS') {
// PostScript XObjects are unused when viewing documents.
// See section 4.7.1 of Adobe's PDF reference.
info('Ignored XObject subtype PS');
continue;
} else {
throw new FormatError(
`Unhandled XObject subtype ${type.name}`);
}
}
break;
resolveXObject();
}).catch(function(reason) {
if (self.options.ignoreErrors) {
// Error(s) in the XObject -- sending unsupported feature
// notification and allow rendering to continue.
self.handler.send('UnsupportedFeature',
{ featureId: UNSUPPORTED_FEATURES.unknown, });
warn(`getOperatorList - ignoring XObject: "${reason}".`);
return;
}
throw reason;
}));
return;
case OPS.setFont:
var fontSize = args[1];
// eagerly collect all fonts
@ -1666,73 +1679,93 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
}
var name = args[0].name;
if (name in skipEmptyXObjs) {
if (name && skipEmptyXObjs[name] !== undefined) {
break;
}
var xobj = xobjs.get(name);
if (!xobj) {
break;
}
if (!isStream(xobj)) {
throw new FormatError('XObject should be a stream');
}
var type = xobj.dict.get('Subtype');
if (!isName(type)) {
throw new FormatError('XObject should have a Name subtype');
}
if (type.name !== 'Form') {
skipEmptyXObjs[name] = true;
break;
}
// Use a new `StateManager` to prevent incorrect positioning of
// textItems *after* the Form XObject, since errors in the data
// can otherwise prevent `restore` operators from being executed.
// NOTE: This is only an issue when `options.ignoreErrors = true`.
var currentState = stateManager.state.clone();
var xObjStateManager = new StateManager(currentState);
var matrix = xobj.dict.getArray('Matrix');
if (Array.isArray(matrix) && matrix.length === 6) {
xObjStateManager.transform(matrix);
}
// Enqueue the `textContent` chunk before parsing the /Form
// XObject.
enqueueChunk();
let sinkWrapper = {
enqueueInvoked: false,
enqueue(chunk, size) {
this.enqueueInvoked = true;
sink.enqueue(chunk, size);
},
get desiredSize() {
return sink.desiredSize;
},
get ready() {
return sink.ready;
},
};
next(self.getTextContent({
stream: xobj,
task,
resources: xobj.dict.get('Resources') || resources,
stateManager: xObjStateManager,
normalizeWhitespace,
combineTextItems,
sink: sinkWrapper,
seenStyles,
}).then(function() {
if (!sinkWrapper.enqueueInvoked) {
skipEmptyXObjs[name] = true;
next(new Promise(function(resolveXObject, rejectXObject) {
if (!name) {
throw new FormatError('XObject must be referred to by name.');
}
let xobj = xobjs.get(name);
if (!xobj) {
resolveXObject();
return;
}
if (!isStream(xobj)) {
throw new FormatError('XObject should be a stream');
}
let type = xobj.dict.get('Subtype');
if (!isName(type)) {
throw new FormatError('XObject should have a Name subtype');
}
if (type.name !== 'Form') {
skipEmptyXObjs[name] = true;
resolveXObject();
return;
}
// Use a new `StateManager` to prevent incorrect positioning of
// textItems *after* the Form XObject, since errors in the data
// can otherwise prevent `restore` operators from executing.
// NOTE: Only an issue when `options.ignoreErrors === true`.
let currentState = stateManager.state.clone();
let xObjStateManager = new StateManager(currentState);
let matrix = xobj.dict.getArray('Matrix');
if (Array.isArray(matrix) && matrix.length === 6) {
xObjStateManager.transform(matrix);
}
// Enqueue the `textContent` chunk before parsing the /Form
// XObject.
enqueueChunk();
let sinkWrapper = {
enqueueInvoked: false,
enqueue(chunk, size) {
this.enqueueInvoked = true;
sink.enqueue(chunk, size);
},
get desiredSize() {
return sink.desiredSize;
},
get ready() {
return sink.ready;
},
};
self.getTextContent({
stream: xobj,
task,
resources: xobj.dict.get('Resources') || resources,
stateManager: xObjStateManager,
normalizeWhitespace,
combineTextItems,
sink: sinkWrapper,
seenStyles,
}).then(function() {
if (!sinkWrapper.enqueueInvoked) {
skipEmptyXObjs[name] = true;
}
resolveXObject();
}, rejectXObject);
}).catch(function(reason) {
if (reason instanceof AbortException) {
return;
}
if (self.options.ignoreErrors) {
// Error(s) in the XObject -- allow text-extraction to
// continue.
warn(`getTextContent - ignoring XObject: "${reason}".`);
return;
}
throw reason;
}));
return;
case OPS.setGState:

View File

@ -57,6 +57,7 @@
!issue8480.pdf
!issue8570.pdf
!issue8697.pdf
!issue8702.pdf
!issue8707.pdf
!issue8798r.pdf
!issue8823.pdf

BIN
test/pdfs/issue8702.pdf Normal file

Binary file not shown.

View File

@ -1640,6 +1640,22 @@
"lastPage": 1,
"type": "load"
},
{ "id": "issue8702-eq",
"file": "pdfs/issue8702.pdf",
"md5": "59d501ed1518d78ef6ee442cf824b0f6",
"rounds": 1,
"link": false,
"lastPage": 1,
"type": "eq"
},
{ "id": "issue8702-text",
"file": "pdfs/issue8702.pdf",
"md5": "59d501ed1518d78ef6ee442cf824b0f6",
"rounds": 1,
"link": false,
"lastPage": 1,
"type": "text"
},
{ "id": "pr4897",
"file": "pdfs/pr4897.pdf",
"md5": "26897633eea5e6d10345a130b1c1777c",

View File

@ -14,9 +14,9 @@
*/
import { Dict, Name } from '../../src/core/primitives';
import { FormatError, OPS } from '../../src/shared/util';
import { OperatorList, PartialEvaluator } from '../../src/core/evaluator';
import { Stream, StringStream } from '../../src/core/stream';
import { OPS } from '../../src/shared/util';
import { WorkerTask } from '../../src/core/worker';
import { XRefMock } from './test_utils';
@ -48,6 +48,8 @@ describe('evaluator', function() {
operatorList: result,
}).then(function() {
callback(result);
}, function(reason) {
callback(reason);
});
}
@ -229,9 +231,9 @@ describe('evaluator', function() {
it('should skip paintXObject if name is missing', function(done) {
var stream = new StringStream('/ Do');
runOperatorListCheck(partialEvaluator, stream, new ResourcesMock(),
function (result) {
expect(result.argsArray).toEqual([]);
expect(result.fnArray).toEqual([]);
function(result) {
expect(result instanceof FormatError).toEqual(true);
expect(result.message).toEqual('XObject must be referred to by name.');
done();
});
});