[api-minor] Always allow e.g. rendering to continue even if there are errors, and add a stopAtErrors parameter to getDocument to opt out of this behaviour (issue 6342, issue 3795, bug 1130815)

Other PDF readers, e.g. Adobe Reader and PDFium (in Chrome), will attempt to render as much of a page as possible even if there are errors present.
Currently we just bail as soon as the first error is hit, which means that we'll usually not render anything in these cases and just display a blank page instead.

NOTE: This patch changes the default behaviour of the PDF.js API to always attempt to recover as much data as possible, even when encountering errors during e.g. `getOperatorList`/`getTextContent`, which thus improves our handling of corrupt PDF files and allows the default viewer to handle errors slightly more gracefully.
In the event that an API consumer wishes to use the old behaviour, where we stop parsing as soon as an error is encountered, the `stopAtErrors` parameter can be set at `getDocument`.

Fixes, inasmuch as it's possible since the PDF files are corrupt, e.g. issue 6342, issue 3795, and [bug 1130815](https://bugzilla.mozilla.org/show_bug.cgi?id=1130815) (and probably others too).
This commit is contained in:
Jonas Jenwald 2017-02-19 14:03:08 +01:00
parent 10e5f766a2
commit a39d636eb8
8 changed files with 255 additions and 50 deletions

View File

@ -454,16 +454,15 @@ var Annotation = (function AnnotationClosure() {
var self = this;
return resourcesPromise.then(function(resources) {
var opList = new OperatorList();
opList.addOp(OPS.beginAnnotation, [data.rect, transform, matrix]);
return evaluator.getOperatorList(self.appearance, task,
resources, opList).
then(function () {
opList.addOp(OPS.endAnnotation, []);
self.appearance.reset();
return opList;
});
var opList = new OperatorList();
opList.addOp(OPS.beginAnnotation, [data.rect, transform, matrix]);
return evaluator.getOperatorList(self.appearance, task,
resources, opList).then(function () {
opList.addOp(OPS.endAnnotation, []);
self.appearance.reset();
return opList;
});
});
}
};
@ -758,10 +757,9 @@ var TextWidgetAnnotation = (function TextWidgetAnnotationClosure() {
var stream = new Stream(stringToBytes(this.data.defaultAppearance));
return evaluator.getOperatorList(stream, task, this.fieldResources,
operatorList).
then(function () {
return operatorList;
});
operatorList).then(function () {
return operatorList;
});
}
});

View File

@ -114,6 +114,7 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
maxImageSize: -1,
disableFontFace: false,
disableNativeImageDecoder: false,
ignoreErrors: false,
};
function NativeImageDecoder(xref, resources, handler, forceDataSchema) {
@ -342,9 +343,10 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
operatorList,
task,
initialState) {
var matrix = xobj.dict.getArray('Matrix');
var bbox = xobj.dict.getArray('BBox');
var group = xobj.dict.get('Group');
var dict = xobj.dict;
var matrix = dict.getArray('Matrix');
var bbox = dict.getArray('BBox');
var group = dict.get('Group');
if (group) {
var groupOptions = {
matrix: matrix,
@ -374,8 +376,8 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
operatorList.addOp(OPS.paintFormXObjectBegin, [matrix, bbox]);
return this.getOperatorList(xobj, task,
(xobj.dict.get('Resources') || resources), operatorList, initialState).
then(function () {
(dict.get('Resources') || resources),
operatorList, initialState).then(function () {
operatorList.addOp(OPS.paintFormXObjectEnd, []);
if (group) {
@ -522,7 +524,8 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
}
return this.buildFormXObject(resources, smaskContent, smaskOptions,
operatorList, task, stateManager.state.clone());
operatorList, task,
stateManager.state.clone());
},
handleTilingType:
@ -538,14 +541,14 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
return this.getOperatorList(pattern, task, patternResources,
tilingOpList).then(function () {
// Add the dependencies to the parent operator list so they are
// resolved before sub operator list is executed synchronously.
operatorList.addDependencies(tilingOpList.dependencies);
operatorList.addOp(fn, getTilingPatternIR({
fnArray: tilingOpList.fnArray,
argsArray: tilingOpList.argsArray
}, patternDict, args));
});
// Add the dependencies to the parent operator list so they are
// resolved before sub operator list is executed synchronously.
operatorList.addDependencies(tilingOpList.dependencies);
operatorList.addOp(fn, getTilingPatternIR({
fnArray: tilingOpList.fnArray,
argsArray: tilingOpList.argsArray
}, patternDict, args));
});
},
handleSetFont:
@ -899,7 +902,6 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
resources,
operatorList,
initialState) {
var self = this;
var xref = this.xref;
var imageCache = Object.create(null);
@ -913,6 +915,12 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
var preprocessor = new EvaluatorPreprocessor(stream, xref, stateManager);
var timeSlotManager = new TimeSlotManager();
// Appends one `OPS.restore` to `operatorList` for every state that
// `preprocessor.savedStatesDepth` reports as still saved, so that the
// resulting operator list ends with balanced save/restore pairs.
// Takes no arguments; it only reads the enclosing scope.
function closePendingRestoreOPS() {
  // The depth is read once up front and used as the loop bound.
  var pendingRestores = preprocessor.savedStatesDepth;
  for (var i = 0; i < pendingRestores; i++) {
    operatorList.addOp(OPS.restore, []);
  }
}
return new Promise(function promiseBody(resolve, reject) {
var next = function (promise) {
promise.then(function () {
@ -1187,11 +1195,21 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
}
// Some PDFs don't close all restores inside object/form.
// Closing those for them.
for (i = 0, ii = preprocessor.savedStatesDepth; i < ii; i++) {
operatorList.addOp(OPS.restore, []);
}
closePendingRestoreOPS();
resolve();
});
}).catch(function(reason) {
if (this.options.ignoreErrors) {
// Error(s) in the OperatorList -- sending unsupported feature
// notification and allow rendering to continue.
this.handler.send('UnsupportedFeature',
{ featureId: UNSUPPORTED_FEATURES.unknown });
warn('getOperatorList - ignoring errors during task: ' + task.name);
closePendingRestoreOPS();
return;
}
throw reason;
}.bind(this));
},
getTextContent:
@ -1660,19 +1678,24 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
break;
}
stateManager.save();
// Use a new `StateManager` to prevent incorrect positioning of
// textItems *after* the Form XObject, since errors in the data
// can otherwise prevent `restore` operators from being executed.
// NOTE: This is only an issue when `options.ignoreErrors = true`.
var currentState = stateManager.state.clone();
var xObjStateManager = new StateManager(currentState);
var matrix = xobj.dict.getArray('Matrix');
if (isArray(matrix) && matrix.length === 6) {
stateManager.transform(matrix);
xObjStateManager.transform(matrix);
}
next(self.getTextContent(xobj, task,
xobj.dict.get('Resources') || resources, stateManager,
xobj.dict.get('Resources') || resources, xObjStateManager,
normalizeWhitespace, combineTextItems).then(
function (formTextContent) {
Util.appendToArray(textContent.items, formTextContent.items);
Util.extendObj(textContent.styles, formTextContent.styles);
stateManager.restore();
xobjsCache.key = name;
xobjsCache.texts = formTextContent;
@ -1706,7 +1729,16 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
}
flushTextContentItem();
resolve(textContent);
});
}).catch(function(reason) {
if (this.options.ignoreErrors) {
// Error(s) in the TextContent -- allow text-extraction to continue.
warn('getTextContent - ignoring errors during task: ' + task.name);
flushTextContentItem();
return textContent;
}
throw reason;
}.bind(this));
},
extractDataStructures:

View File

@ -732,6 +732,7 @@ var WorkerMessageHandler = {
maxImageSize: data.maxImageSize === undefined ? -1 : data.maxImageSize,
disableFontFace: data.disableFontFace,
disableNativeImageDecoder: data.disableNativeImageDecoder,
ignoreErrors: data.ignoreErrors,
};
getPdfManager(data, evaluatorOptions).then(function (newPdfManager) {
@ -899,15 +900,14 @@ var WorkerMessageHandler = {
handler.on('GetTextContent', function wphExtractText(data) {
var pageIndex = data.pageIndex;
var normalizeWhitespace = data.normalizeWhitespace;
var combineTextItems = data.combineTextItems;
return pdfManager.getPage(pageIndex).then(function(page) {
var task = new WorkerTask('GetTextContent: page ' + pageIndex);
startWorkerTask(task);
var pageNum = pageIndex + 1;
var start = Date.now();
return page.extractTextContent(handler, task, normalizeWhitespace,
combineTextItems).then(
return page.extractTextContent(handler, task, data.normalizeWhitespace,
data.combineTextItems).then(
function(textContent) {
finishWorkerTask(task);
info('text indexing: page=' + pageNum + ' - time=' +

View File

@ -148,6 +148,10 @@ if (typeof PDFJSDev !== 'undefined' &&
* used when reading built-in CMap files. Providing a custom factory is useful
* for environments without `XMLHttpRequest` support, such as e.g. Node.js.
* The default value is {DOMCMapReaderFactory}.
* @property {boolean} stopAtErrors - (optional) Reject certain promises, e.g.
* `getOperatorList`, `getTextContent`, and `RenderTask`, when the associated
* PDF data cannot be successfully parsed, instead of attempting to recover
* as much of the data as possible. The default value is `false`.
*/
/**
@ -262,6 +266,7 @@ function getDocument(src, pdfDataRangeTransport,
params.rangeChunkSize = params.rangeChunkSize || DEFAULT_RANGE_CHUNK_SIZE;
params.disableNativeImageDecoder = params.disableNativeImageDecoder === true;
params.ignoreErrors = params.stopAtErrors !== true;
var CMapReaderFactory = params.CMapReaderFactory || DOMCMapReaderFactory;
if (!worker) {
@ -325,6 +330,7 @@ function _fetchDocument(worker, source, pdfDataRangeTransport, docId) {
!isPostMessageTransfersDisabled,
docBaseUrl: source.docBaseUrl,
disableNativeImageDecoder: source.disableNativeImageDecoder,
ignoreErrors: source.ignoreErrors,
}).then(function (workerId) {
if (worker.destroyed) {
throw new Error('Worker was destroyed');
@ -826,8 +832,6 @@ var PDFPageProxy = (function PDFPageProxyClosure() {
this.pendingCleanup = false;
var renderingIntent = (params.intent === 'print' ? 'print' : 'display');
var renderInteractiveForms = (params.renderInteractiveForms === true ?
true : /* Default */ false);
var canvasFactory = params.canvasFactory || new DOMCanvasFactory();
if (!this.intentStates[renderingIntent]) {
@ -850,7 +854,7 @@ var PDFPageProxy = (function PDFPageProxyClosure() {
this.transport.messageHandler.send('RenderPageRequest', {
pageIndex: this.pageNumber - 1,
intent: renderingIntent,
renderInteractiveForms: renderInteractiveForms,
renderInteractiveForms: (params.renderInteractiveForms === true),
});
}
@ -914,7 +918,7 @@ var PDFPageProxy = (function PDFPageProxyClosure() {
/**
* @return {Promise} A promise resolved with an {@link PDFOperatorList}
* object that represents page's operator list.
* object that represents page's operator list.
*/
getOperatorList: function PDFPageProxy_getOperatorList() {
function operatorListChanged() {
@ -950,7 +954,7 @@ var PDFPageProxy = (function PDFPageProxyClosure() {
this.transport.messageHandler.send('RenderPageRequest', {
pageIndex: this.pageIndex,
intent: renderingIntent
intent: renderingIntent,
});
}
return intentState.opListReadCapability.promise;
@ -962,12 +966,11 @@ var PDFPageProxy = (function PDFPageProxyClosure() {
* object that represent the page text content.
*/
getTextContent: function PDFPageProxy_getTextContent(params) {
params = params || {};
return this.transport.messageHandler.sendWithPromise('GetTextContent', {
pageIndex: this.pageNumber - 1,
normalizeWhitespace: (params && params.normalizeWhitespace === true ?
true : /* Default */ false),
combineTextItems: (params && params.disableCombineTextItems === true ?
false : /* Default */ true),
normalizeWhitespace: (params.normalizeWhitespace === true),
combineTextItems: (params.disableCombineTextItems !== true),
});
},

View File

@ -21,6 +21,7 @@
!issue5874.pdf
!issue5808.pdf
!issue6204.pdf
!issue6342.pdf
!issue6652.pdf
!issue6782.pdf
!issue6901.pdf

View File

@ -0,0 +1 @@
https://bug1130815.bmoattachments.org/attachment.cgi?id=8560958

142
test/pdfs/issue6342.pdf Normal file
View File

@ -0,0 +1,142 @@
%PDF-1.7
%âãÏÓ
1 0 obj
<<
/Kids [2 0 R]
/Count 1
/Type /Pages
>>
endobj
2 0 obj
<<
/Group 3 0 R
/Parent 1 0 R
/Resources 4 0 R
/MediaBox [0 0 300 100]
/Type /Page
/Contents 5 0 R
>>
endobj
3 0 obj
<<
/CS /DeviceRGB
/Type /Group
/S /Transparency
>>
endobj
4 0 obj
<<
/Font
<<
/F1 6 0 R
>>
/XObject
<<
/Im1 7 0 R
>>
>>
endobj
5 0 obj
<<
/Length 193
>>
stream
q
1 0 0 1 10 80 cm
0 0 0 rg 0 0 0 RG
1 w
0 0 m
280 0 l S
Q
q
1 0 0 1 25 45 cm
/Im1 Do
1 0 0 1 100 0 cm
/Im1 Do
Q
q
1 0 0 1 10 20 cm
BT
/F1 18 Tf
(Issue 6342 - Form XObject with errors) Tj
ET
Q
endstream
endobj
7 0 obj
<<
/Group 3 0 R
/Subtype /Form
/Length 1050
/Resources
<<
/ExtGState
<<
/a0
<<
/ca 1
/CA 1
>>
>>
>>
/FormType 1
/BBox [0 0 45 25]
/Type /XObject
>>
stream
q
0.2 0.8 0.2 rg /a0 gs
13.117 22.651 m 11.281 22.651 9.809 21.163 9.809 19.327 c 9.809 18.733
9.961 18.174 10.234 17.69 c 11.34 18.315 12.621 18.678 13.98 18.678 c
14.113 18.678 14.238 18.674 14.367 18.666 c 14.352 18.85 14.344 19.038
14.344 19.229 c 14.344 20.252 14.566 21.225 14.957 22.1 c 14.43 22.455
13.801 22.651 13.117 22.651 c h
13.117 22.651 m f
6.383 12.92 m 2.859 12.92 0 10.084 0 6.561 c 0 3.034 2.859 0.174 6.383
0.174 c 7.727 0.174 8.969 0.592 9.996 1.299 c 9.57 1.959 9.32 2.748
9.32 3.584 .020.594 6. c 499 c08. c830.174 586.17 21.17436
8.4 6.17436 9 c030.1717436 9 c 18.6.418.85930784 07.859 05.1717c08.
859 09.6.442.859 12m f
6.383 12651 m f
6.383 12.9f
678 13.757 5.651727563.757 5.7.0 2.8 c858.7.0 2.8 c030.177.0 2.09 8
8.136.1778899 797 5.521265172496.17873.8 c90674 c95.65153174 c95.c.455 c 7.4 c95.651918.7.770.252105.7.74522.1 c047 61 18.67802 61623.67802 748069.229 c02 1 m 13.9 c719.651202 15c90678 c809.3215c195.654 18.6746 3.
117768.674469..75728.229 c 09..7578.6741452.757 5.678 13.757 5.651 c h 13.757 5.65f
Q
endstream
endobj
6 0 obj
<<
/BaseFont /Times-Roman
/Subtype /Type1
/Encoding /WinAnsiEncoding
/Type /Font
>>
endobj
8 0 obj
<<
/Pages 1 0 R
/Type /Catalog
>>
endobj xref
0 9
0000000000 65535 f
0000000015 00000 n
0000000074 00000 n
0000000193 00000 n
0000000261 00000 n
0000000334 00000 n
0000001818 00000 n
0000000581 00000 n
0000001919 00000 n
trailer
<<
/Root 8 0 R
/Size 9
>>
startxref
1969
%%EOF

View File

@ -1500,6 +1500,20 @@
"lastPage": 1,
"type": "load"
},
{ "id": "bug1130815-eq",
"file": "pdfs/bug1130815.pdf",
"md5": "3ff3b550c3af766991b2a1b11d00de85",
"rounds": 1,
"link": true,
"type": "eq"
},
{ "id": "bug1130815-text",
"file": "pdfs/bug1130815.pdf",
"md5": "3ff3b550c3af766991b2a1b11d00de85",
"rounds": 1,
"link": true,
"type": "text"
},
{ "id": "issue3248",
"file": "pdfs/issue3248.pdf",
"md5": "970767ed68de46c316d74de67965999b",
@ -1532,6 +1546,20 @@
"lastPage": 1,
"type": "load"
},
{ "id": "issue6342-eq",
"file": "pdfs/issue6342.pdf",
"md5": "2ea85ca8d17117798f105be88bdb2bfd",
"rounds": 1,
"link": false,
"type": "eq"
},
{ "id": "issue6342-text",
"file": "pdfs/issue6342.pdf",
"md5": "2ea85ca8d17117798f105be88bdb2bfd",
"rounds": 1,
"link": false,
"type": "text"
},
{ "id": "issue7020",
"file": "pdfs/issue7020.pdf",
"md5": "93b464e21c649e64ae92eeafe99fc31b",