Allow `getOperatorList`/`getTextContent` to skip errors when parsing broken XObjects (issue 8702, issue 8704)
This patch makes use of the existing `ignoreErrors` property in `src/core/evaluator.js`, see PRs 8240 and 8441, thus allowing us to attempt to recover as much as possible of a page even when it contains broken XObjects. Fixes 8702. Fixes 8704.
This commit is contained in:
parent
b3f8411264
commit
b1472cddbb
@ -948,52 +948,65 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
|
||||
case OPS.paintXObject:
|
||||
// eagerly compile XForm objects
|
||||
var name = args[0].name;
|
||||
if (!name) {
|
||||
warn('XObject must be referred to by name.');
|
||||
continue;
|
||||
}
|
||||
if (imageCache[name] !== undefined) {
|
||||
if (name && imageCache[name] !== undefined) {
|
||||
operatorList.addOp(imageCache[name].fn, imageCache[name].args);
|
||||
args = null;
|
||||
continue;
|
||||
}
|
||||
|
||||
var xobj = xobjs.get(name);
|
||||
if (xobj) {
|
||||
next(new Promise(function(resolveXObject, rejectXObject) {
|
||||
if (!name) {
|
||||
throw new FormatError('XObject must be referred to by name.');
|
||||
}
|
||||
|
||||
let xobj = xobjs.get(name);
|
||||
if (!xobj) {
|
||||
operatorList.addOp(fn, args);
|
||||
resolveXObject();
|
||||
return;
|
||||
}
|
||||
if (!isStream(xobj)) {
|
||||
throw new FormatError('XObject should be a stream');
|
||||
}
|
||||
|
||||
var type = xobj.dict.get('Subtype');
|
||||
let type = xobj.dict.get('Subtype');
|
||||
if (!isName(type)) {
|
||||
throw new FormatError('XObject should have a Name subtype');
|
||||
}
|
||||
|
||||
if (type.name === 'Form') {
|
||||
stateManager.save();
|
||||
next(self.buildFormXObject(resources, xobj, null,
|
||||
operatorList, task,
|
||||
stateManager.state.clone()).
|
||||
then(function () {
|
||||
self.buildFormXObject(resources, xobj, null, operatorList,
|
||||
task, stateManager.state.clone()).
|
||||
then(function() {
|
||||
stateManager.restore();
|
||||
}));
|
||||
resolveXObject();
|
||||
}, rejectXObject);
|
||||
return;
|
||||
} else if (type.name === 'Image') {
|
||||
self.buildPaintImageXObject(resources, xobj, false,
|
||||
operatorList, name, imageCache);
|
||||
args = null;
|
||||
continue;
|
||||
operatorList, name, imageCache);
|
||||
} else if (type.name === 'PS') {
|
||||
// PostScript XObjects are unused when viewing documents.
|
||||
// See section 4.7.1 of Adobe's PDF reference.
|
||||
info('Ignored XObject subtype PS');
|
||||
continue;
|
||||
} else {
|
||||
throw new FormatError(
|
||||
`Unhandled XObject subtype ${type.name}`);
|
||||
}
|
||||
}
|
||||
break;
|
||||
resolveXObject();
|
||||
}).catch(function(reason) {
|
||||
if (self.options.ignoreErrors) {
|
||||
// Error(s) in the XObject -- sending unsupported feature
|
||||
// notification and allow rendering to continue.
|
||||
self.handler.send('UnsupportedFeature',
|
||||
{ featureId: UNSUPPORTED_FEATURES.unknown, });
|
||||
warn(`getOperatorList - ignoring XObject: "${reason}".`);
|
||||
return;
|
||||
}
|
||||
throw reason;
|
||||
}));
|
||||
return;
|
||||
case OPS.setFont:
|
||||
var fontSize = args[1];
|
||||
// eagerly collect all fonts
|
||||
@ -1666,73 +1679,93 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
|
||||
}
|
||||
|
||||
var name = args[0].name;
|
||||
if (name in skipEmptyXObjs) {
|
||||
if (name && skipEmptyXObjs[name] !== undefined) {
|
||||
break;
|
||||
}
|
||||
|
||||
var xobj = xobjs.get(name);
|
||||
if (!xobj) {
|
||||
break;
|
||||
}
|
||||
if (!isStream(xobj)) {
|
||||
throw new FormatError('XObject should be a stream');
|
||||
}
|
||||
|
||||
var type = xobj.dict.get('Subtype');
|
||||
if (!isName(type)) {
|
||||
throw new FormatError('XObject should have a Name subtype');
|
||||
}
|
||||
|
||||
if (type.name !== 'Form') {
|
||||
skipEmptyXObjs[name] = true;
|
||||
break;
|
||||
}
|
||||
|
||||
// Use a new `StateManager` to prevent incorrect positioning of
|
||||
// textItems *after* the Form XObject, since errors in the data
|
||||
// can otherwise prevent `restore` operators from being executed.
|
||||
// NOTE: This is only an issue when `options.ignoreErrors = true`.
|
||||
var currentState = stateManager.state.clone();
|
||||
var xObjStateManager = new StateManager(currentState);
|
||||
|
||||
var matrix = xobj.dict.getArray('Matrix');
|
||||
if (Array.isArray(matrix) && matrix.length === 6) {
|
||||
xObjStateManager.transform(matrix);
|
||||
}
|
||||
|
||||
// Enqueue the `textContent` chunk before parsing the /Form
|
||||
// XObject.
|
||||
enqueueChunk();
|
||||
let sinkWrapper = {
|
||||
enqueueInvoked: false,
|
||||
|
||||
enqueue(chunk, size) {
|
||||
this.enqueueInvoked = true;
|
||||
sink.enqueue(chunk, size);
|
||||
},
|
||||
|
||||
get desiredSize() {
|
||||
return sink.desiredSize;
|
||||
},
|
||||
|
||||
get ready() {
|
||||
return sink.ready;
|
||||
},
|
||||
};
|
||||
|
||||
next(self.getTextContent({
|
||||
stream: xobj,
|
||||
task,
|
||||
resources: xobj.dict.get('Resources') || resources,
|
||||
stateManager: xObjStateManager,
|
||||
normalizeWhitespace,
|
||||
combineTextItems,
|
||||
sink: sinkWrapper,
|
||||
seenStyles,
|
||||
}).then(function() {
|
||||
if (!sinkWrapper.enqueueInvoked) {
|
||||
skipEmptyXObjs[name] = true;
|
||||
next(new Promise(function(resolveXObject, rejectXObject) {
|
||||
if (!name) {
|
||||
throw new FormatError('XObject must be referred to by name.');
|
||||
}
|
||||
|
||||
let xobj = xobjs.get(name);
|
||||
if (!xobj) {
|
||||
resolveXObject();
|
||||
return;
|
||||
}
|
||||
if (!isStream(xobj)) {
|
||||
throw new FormatError('XObject should be a stream');
|
||||
}
|
||||
|
||||
let type = xobj.dict.get('Subtype');
|
||||
if (!isName(type)) {
|
||||
throw new FormatError('XObject should have a Name subtype');
|
||||
}
|
||||
|
||||
if (type.name !== 'Form') {
|
||||
skipEmptyXObjs[name] = true;
|
||||
resolveXObject();
|
||||
return;
|
||||
}
|
||||
|
||||
// Use a new `StateManager` to prevent incorrect positioning of
|
||||
// textItems *after* the Form XObject, since errors in the data
|
||||
// can otherwise prevent `restore` operators from executing.
|
||||
// NOTE: Only an issue when `options.ignoreErrors === true`.
|
||||
let currentState = stateManager.state.clone();
|
||||
let xObjStateManager = new StateManager(currentState);
|
||||
|
||||
let matrix = xobj.dict.getArray('Matrix');
|
||||
if (Array.isArray(matrix) && matrix.length === 6) {
|
||||
xObjStateManager.transform(matrix);
|
||||
}
|
||||
|
||||
// Enqueue the `textContent` chunk before parsing the /Form
|
||||
// XObject.
|
||||
enqueueChunk();
|
||||
let sinkWrapper = {
|
||||
enqueueInvoked: false,
|
||||
|
||||
enqueue(chunk, size) {
|
||||
this.enqueueInvoked = true;
|
||||
sink.enqueue(chunk, size);
|
||||
},
|
||||
|
||||
get desiredSize() {
|
||||
return sink.desiredSize;
|
||||
},
|
||||
|
||||
get ready() {
|
||||
return sink.ready;
|
||||
},
|
||||
};
|
||||
|
||||
self.getTextContent({
|
||||
stream: xobj,
|
||||
task,
|
||||
resources: xobj.dict.get('Resources') || resources,
|
||||
stateManager: xObjStateManager,
|
||||
normalizeWhitespace,
|
||||
combineTextItems,
|
||||
sink: sinkWrapper,
|
||||
seenStyles,
|
||||
}).then(function() {
|
||||
if (!sinkWrapper.enqueueInvoked) {
|
||||
skipEmptyXObjs[name] = true;
|
||||
}
|
||||
resolveXObject();
|
||||
}, rejectXObject);
|
||||
}).catch(function(reason) {
|
||||
if (reason instanceof AbortException) {
|
||||
return;
|
||||
}
|
||||
if (self.options.ignoreErrors) {
|
||||
// Error(s) in the XObject -- allow text-extraction to
|
||||
// continue.
|
||||
warn(`getTextContent - ignoring XObject: "${reason}".`);
|
||||
return;
|
||||
}
|
||||
throw reason;
|
||||
}));
|
||||
return;
|
||||
case OPS.setGState:
|
||||
|
1
test/pdfs/.gitignore
vendored
1
test/pdfs/.gitignore
vendored
@ -57,6 +57,7 @@
|
||||
!issue8480.pdf
|
||||
!issue8570.pdf
|
||||
!issue8697.pdf
|
||||
!issue8702.pdf
|
||||
!issue8707.pdf
|
||||
!issue8798r.pdf
|
||||
!issue8823.pdf
|
||||
|
BIN
test/pdfs/issue8702.pdf
Normal file
BIN
test/pdfs/issue8702.pdf
Normal file
Binary file not shown.
@ -1640,6 +1640,22 @@
|
||||
"lastPage": 1,
|
||||
"type": "load"
|
||||
},
|
||||
{ "id": "issue8702-eq",
|
||||
"file": "pdfs/issue8702.pdf",
|
||||
"md5": "59d501ed1518d78ef6ee442cf824b0f6",
|
||||
"rounds": 1,
|
||||
"link": false,
|
||||
"lastPage": 1,
|
||||
"type": "eq"
|
||||
},
|
||||
{ "id": "issue8702-text",
|
||||
"file": "pdfs/issue8702.pdf",
|
||||
"md5": "59d501ed1518d78ef6ee442cf824b0f6",
|
||||
"rounds": 1,
|
||||
"link": false,
|
||||
"lastPage": 1,
|
||||
"type": "text"
|
||||
},
|
||||
{ "id": "pr4897",
|
||||
"file": "pdfs/pr4897.pdf",
|
||||
"md5": "26897633eea5e6d10345a130b1c1777c",
|
||||
|
@ -14,9 +14,9 @@
|
||||
*/
|
||||
|
||||
import { Dict, Name } from '../../src/core/primitives';
|
||||
import { FormatError, OPS } from '../../src/shared/util';
|
||||
import { OperatorList, PartialEvaluator } from '../../src/core/evaluator';
|
||||
import { Stream, StringStream } from '../../src/core/stream';
|
||||
import { OPS } from '../../src/shared/util';
|
||||
import { WorkerTask } from '../../src/core/worker';
|
||||
import { XRefMock } from './test_utils';
|
||||
|
||||
@ -48,6 +48,8 @@ describe('evaluator', function() {
|
||||
operatorList: result,
|
||||
}).then(function() {
|
||||
callback(result);
|
||||
}, function(reason) {
|
||||
callback(reason);
|
||||
});
|
||||
}
|
||||
|
||||
@ -229,9 +231,9 @@ describe('evaluator', function() {
|
||||
it('should skip paintXObject if name is missing', function(done) {
|
||||
var stream = new StringStream('/ Do');
|
||||
runOperatorListCheck(partialEvaluator, stream, new ResourcesMock(),
|
||||
function (result) {
|
||||
expect(result.argsArray).toEqual([]);
|
||||
expect(result.fnArray).toEqual([]);
|
||||
function(result) {
|
||||
expect(result instanceof FormatError).toEqual(true);
|
||||
expect(result.message).toEqual('XObject must be referred to by name.');
|
||||
done();
|
||||
});
|
||||
});
|
||||
|
Loading…
Reference in New Issue
Block a user