pdf.js/src/core/document.js

672 lines
22 KiB
JavaScript
Raw Normal View History

2012-09-01 07:48:21 +09:00
/* Copyright 2012 Mozilla Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
2011-10-25 10:13:12 +09:00
import {
assert, FormatError, getInheritableProperty, info, isArrayBuffer, isBool,
isNum, isSpace, isString, MissingDataException, OPS, shadow, stringToBytes,
Check that the first page can be successfully loaded, to try and ascertain the validity of the XRef table (issue 7496, issue 10326) For PDF documents with sufficiently broken XRef tables, it's usually quite obvious when you need to fall back to indexing the entire file. However, for certain kinds of corrupted PDF documents the XRef table will, for all intents and purposes, appear to be valid. It's not until you actually try to fetch various objects that things will start to break, which is the case in the referenced issues[1]. Since there's generally a real effort being made in PDF.js to load even corrupt PDF documents, this patch contains a suggested approach to attempt to do a bit more validation of the XRef table during the initial document loading phase. Here the choice is made to attempt to load the *first* page, as a basic sanity check of the validity of the XRef table. Please note that attempting to load a more-or-less arbitrarily chosen object without any context of what it's supposed to be isn't very useful, which is why this particular choice was made. Obviously, just because the first page can be loaded successfully that doesn't guarantee that the *entire* XRef table is valid, however if even the first page fails to load you can be reasonably sure that the document is *not* valid[2]. Even though this patch won't cause any significant increase in the amount of parsing required during initial loading of the document[3], it will require loading of more data upfront which thus delays the initial `getDocument` call. Whether or not this is a problem depends very much on what you actually measure, please consider the following examples: ```javascript console.time('first'); getDocument(...).promise.then((pdfDocument) => { console.timeEnd('first'); }); console.time('second'); getDocument(...).promise.then((pdfDocument) => { pdfDocument.getPage(1).then((pdfPage) => { // Note: the API uses `pageNumber >= 1`, the Worker uses `pageIndex >= 0`.
console.timeEnd('second'); }); }); ``` The first case is pretty much guaranteed to show a small regression, however the second case won't be affected at all since the Worker caches the result of `getPage` calls. Again, please remember that the second case is what matters for the standard PDF.js use-case which is why I'm hoping that this patch is deemed acceptable. --- [1] In issue 7496, the problem is that the document is edited without the XRef table being correctly updated. In issue 10326, the generator was sorting the XRef table according to the offsets rather than the objects. [2] The idea of checking the first page in particular came from the "standard" use-case for the PDF.js library, i.e. the default viewer, where a failure to load the first page basically means that nothing will work; note how `{BaseViewer, PDFThumbnailViewer}.setDocument` depends completely on being able to fetch the *first* page. [3] The only extra parsing is caused by, potentially, having to traverse *part* of the `Pages` tree to find the first page.
2018-12-05 05:51:27 +09:00
stringToPDFString, Util, warn, XRefEntryException, XRefParseException
} from '../shared/util';
import { Catalog, ObjectLoader, XRef } from './obj';
import { Dict, isDict, isName, isStream, Ref } from './primitives';
import { NullStream, Stream, StreamsSequenceStream } from './stream';
import { AnnotationFactory } from './annotation';
import { calculateMD5 } from './crypto';
import { Linearization } from './parser';
import { OperatorList } from './operator_list';
import { PartialEvaluator } from './evaluator';
import { PDFFunctionFactory } from './function';
// Fallback used by `Page.userUnit` when the page dictionary's `UserUnit`
// entry is missing, not a number, or not positive.
const DEFAULT_USER_UNIT = 1.0;
// Fallback used by `Page.mediaBox` when no valid four-element `MediaBox`
// array is found (presumably US Letter in PDF points -- TODO confirm units).
const LETTER_SIZE_MEDIABOX = [0, 0, 612, 792];
/**
 * Decides whether an annotation takes part in rendering for a given intent.
 *
 * @param {Object} annotation - Object exposing `viewable` and `printable`.
 * @param {string} intent - Rendering intent; 'display' and 'print' are the
 *   only values that can yield a truthy result.
 * @returns {*} For 'display': `annotation.viewable` when truthy, else `false`.
 *   For 'print': `annotation.printable` as-is. Otherwise `false`.
 */
function isAnnotationRenderable(annotation, intent) {
  if (intent === 'display') {
    // A falsy `viewable` always collapses to the boolean `false`.
    return annotation.viewable || false;
  }
  if (intent === 'print') {
    return annotation.printable;
  }
  return false;
}
2014-03-14 22:39:35 +09:00
/**
 * Wraps a single page dictionary of a PDF document and exposes its
 * attributes (boxes, rotation, resources), its content stream, and the
 * operations built on top of them (operator list, text content, annotations).
 *
 * Most getters cache their result through `shadow`, so the underlying
 * lookup only runs on first access.
 */
class Page {
  /**
   * @param {Object} params
   * @param {Object} params.pdfManager - Manager used for `ensure` calls; its
   *   `evaluatorOptions` are copied onto the page.
   * @param {Object} params.xref - Cross-reference table used to fetch objects.
   * @param {number} params.pageIndex - Zero-based page index (used as part of
   *   the prefix for generated object ids).
   * @param {Object} params.pageDict - The page dictionary.
   * @param {Object} params.ref - Reference to the page object.
   * @param {Object} params.fontCache
   * @param {Object} params.builtInCMapCache
   * @param {Object} params.pdfFunctionFactory
   */
  constructor({ pdfManager, xref, pageIndex, pageDict, ref, fontCache,
                builtInCMapCache, pdfFunctionFactory, }) {
    this.pdfManager = pdfManager;
    this.pageIndex = pageIndex;
    this.pageDict = pageDict;
    this.xref = xref;
    this.ref = ref;
    this.fontCache = fontCache;
    this.builtInCMapCache = builtInCMapCache;
    this.pdfFunctionFactory = pdfFunctionFactory;
    this.evaluatorOptions = pdfManager.evaluatorOptions;
    this.resourcesPromise = null;

    // Ids handed out by `idFactory` are unique per page: "p<pageIndex>_<n>",
    // where <n> starts at 1 and increases with every call.
    const uniquePrefix = `p${this.pageIndex}_`;
    const idCounters = {
      obj: 0,
    };
    this.idFactory = {
      createObjId() {
        return uniquePrefix + (++idCounters.obj);
      },
    };
  }

  /**
   * Looks up `key` on the page dictionary via `getInheritableProperty`,
   * collecting all found values (`stopWhenFound: false`).
   *
   * @private
   * @param {string} key
   * @param {boolean} [getArray=false] - Forwarded to `getInheritableProperty`.
   * @returns {*} The value itself when it is not an array; the first entry
   *   when there is only one entry or the first entry is not a dictionary;
   *   otherwise the result of merging all entries with `Dict.merge`.
   */
  _getInheritableProperty(key, getArray = false) {
    const value = getInheritableProperty({ dict: this.pageDict, key, getArray,
                                           stopWhenFound: false, });
    if (!Array.isArray(value)) {
      return value;
    }
    if (value.length === 1 || !isDict(value[0])) {
      return value[0];
    }
    return Dict.merge(this.xref, value);
  }

  /** The raw `Contents` entry of the page dictionary. */
  get content() {
    return this.pageDict.get('Contents');
  }

  /** The page's `Resources`, or `Dict.empty` when none is found. */
  get resources() {
    // For robustness: The spec states that a \Resources entry has to be
    // present, but can be empty. Some documents still omit it; in this case
    // we return an empty dictionary.
    return shadow(this, 'resources',
                  this._getInheritableProperty('Resources') || Dict.empty);
  }

  /** The `MediaBox`, falling back to `LETTER_SIZE_MEDIABOX` when invalid. */
  get mediaBox() {
    const mediaBox = this._getInheritableProperty('MediaBox',
                                                  /* getArray = */ true);
    // Reset invalid media box to letter size.
    if (!Array.isArray(mediaBox) || mediaBox.length !== 4) {
      return shadow(this, 'mediaBox', LETTER_SIZE_MEDIABOX);
    }
    return shadow(this, 'mediaBox', mediaBox);
  }

  /** The `CropBox`, falling back to `mediaBox` when invalid. */
  get cropBox() {
    const cropBox = this._getInheritableProperty('CropBox',
                                                 /* getArray = */ true);
    // Reset invalid crop box to media box.
    if (!Array.isArray(cropBox) || cropBox.length !== 4) {
      return shadow(this, 'cropBox', this.mediaBox);
    }
    return shadow(this, 'cropBox', cropBox);
  }

  /** The `UserUnit`, or `DEFAULT_USER_UNIT` if not a positive number. */
  get userUnit() {
    let obj = this.pageDict.get('UserUnit');
    if (!isNum(obj) || obj <= 0) {
      obj = DEFAULT_USER_UNIT;
    }
    return shadow(this, 'userUnit', obj);
  }

  /**
   * The visible region of the page: the crop box intersected with the media
   * box, or the media box when the two are the same object or do not
   * intersect.
   */
  get view() {
    // From the spec, 6th ed., p.963:
    // "The crop, bleed, trim, and art boxes should not ordinarily
    // extend beyond the boundaries of the media box. If they do, they are
    // effectively reduced to their intersection with the media box."
    const mediaBox = this.mediaBox, cropBox = this.cropBox;
    if (mediaBox === cropBox) {
      return shadow(this, 'view', mediaBox);
    }

    const intersection = Util.intersect(cropBox, mediaBox);
    return shadow(this, 'view', intersection || mediaBox);
  }

  /** The page rotation, normalized to one of 0, 90, 180 or 270. */
  get rotate() {
    let rotate = this._getInheritableProperty('Rotate') || 0;

    // Normalize rotation so it's a multiple of 90 and between 0 and 270.
    if (rotate % 90 !== 0) {
      rotate = 0;
    } else if (rotate >= 360) {
      rotate = rotate % 360;
    } else if (rotate < 0) {
      // The spec doesn't cover negatives. Assume it's counterclockwise
      // rotation. The following is the other implementation of modulo.
      rotate = ((rotate % 360) + 360) % 360;
    }
    return shadow(this, 'rotate', rotate);
  }

  /**
   * Builds a single stream for the page content.
   *
   * @returns {Object} A `StreamsSequenceStream` over all entries when
   *   `Contents` is an array, the stream itself when it is a stream, and a
   *   `NullStream` otherwise.
   */
  getContentStream() {
    const content = this.content;
    let stream;

    if (Array.isArray(content)) {
      // Fetching the individual streams from the array.
      const xref = this.xref;
      const streams = [];
      for (const entry of content) {
        streams.push(xref.fetchIfRef(entry));
      }
      stream = new StreamsSequenceStream(streams);
    } else if (isStream(content)) {
      stream = content;
    } else {
      // Replace non-existent page content with empty content.
      stream = new NullStream();
    }
    return stream;
  }

  /**
   * Loads the given resource categories of this page.
   *
   * @param {Array<string>} keys - Resource dictionary keys handed to
   *   `ObjectLoader`.
   * @returns {Promise} Result of `ObjectLoader.load()`.
   */
  loadResources(keys) {
    if (!this.resourcesPromise) {
      // TODO: add async `_getInheritableProperty` and remove this.
      this.resourcesPromise = this.pdfManager.ensure(this, 'resources');
    }
    return this.resourcesPromise.then(() => {
      const objectLoader = new ObjectLoader(this.resources, keys, this.xref);
      return objectLoader.load();
    });
  }

  /**
   * Builds the operator list for the page content followed by the operator
   * lists of all annotations renderable for `intent`.
   *
   * @param {Object} params
   * @param {Object} params.handler - Receives the 'StartRenderPage' message.
   * @param {Object} params.task
   * @param {string} params.intent - Rendering intent ('display' or 'print').
   * @param {boolean} params.renderInteractiveForms - Forwarded to each
   *   annotation's `getOperatorList`.
   * @returns {Promise<OperatorList>} The flushed page operator list.
   */
  getOperatorList({ handler, task, intent, renderInteractiveForms, }) {
    const contentStreamPromise = this.pdfManager.ensure(this,
                                                        'getContentStream');
    const resourcesPromise = this.loadResources([
      'ExtGState',
      'ColorSpace',
      'Pattern',
      'Shading',
      'XObject',
      'Font',
    ]);

    const partialEvaluator = new PartialEvaluator({
      pdfManager: this.pdfManager,
      xref: this.xref,
      handler,
      pageIndex: this.pageIndex,
      idFactory: this.idFactory,
      fontCache: this.fontCache,
      builtInCMapCache: this.builtInCMapCache,
      options: this.evaluatorOptions,
      pdfFunctionFactory: this.pdfFunctionFactory,
    });

    const dataPromises = Promise.all([contentStreamPromise, resourcesPromise]);
    const pageListPromise = dataPromises.then(([contentStream]) => {
      const opList = new OperatorList(intent, handler, this.pageIndex);

      handler.send('StartRenderPage', {
        transparency: partialEvaluator.hasBlendModes(this.resources),
        pageIndex: this.pageIndex,
        intent,
      });

      return partialEvaluator.getOperatorList({
        stream: contentStream,
        task,
        resources: this.resources,
        operatorList: opList,
      }).then(function() {
        return opList;
      });
    });

    // Fetch the page's annotations and add their operator lists to the
    // page's operator list to render them.
    return Promise.all([pageListPromise, this._parsedAnnotations]).then(
        function([pageOpList, annotations]) {
      if (annotations.length === 0) {
        pageOpList.flush(true);
        return pageOpList;
      }

      // Collect the operator list promises for the annotations. Each promise
      // is resolved with the complete operator list for a single annotation.
      const opListPromises = [];
      for (const annotation of annotations) {
        if (isAnnotationRenderable(annotation, intent)) {
          opListPromises.push(annotation.getOperatorList(
            partialEvaluator, task, renderInteractiveForms));
        }
      }

      return Promise.all(opListPromises).then(function(opLists) {
        pageOpList.addOp(OPS.beginAnnotations, []);
        for (const opList of opLists) {
          pageOpList.addOpList(opList);
        }
        pageOpList.addOp(OPS.endAnnotations, []);
        pageOpList.flush(true);
        return pageOpList;
      });
    });
  }

  /**
   * Extracts the text content of the page through
   * `PartialEvaluator.getTextContent`.
   *
   * @param {Object} params
   * @param {Object} params.handler
   * @param {Object} params.task
   * @param {boolean} params.normalizeWhitespace
   * @param {Object} params.sink
   * @param {boolean} params.combineTextItems
   * @returns {Promise} Result of `PartialEvaluator.getTextContent`.
   */
  extractTextContent({ handler, task, normalizeWhitespace, sink,
                       combineTextItems, }) {
    const contentStreamPromise = this.pdfManager.ensure(this,
                                                        'getContentStream');
    const resourcesPromise = this.loadResources([
      'ExtGState',
      'XObject',
      'Font',
    ]);

    const dataPromises = Promise.all([contentStreamPromise, resourcesPromise]);
    return dataPromises.then(([contentStream]) => {
      const partialEvaluator = new PartialEvaluator({
        pdfManager: this.pdfManager,
        xref: this.xref,
        handler,
        pageIndex: this.pageIndex,
        idFactory: this.idFactory,
        fontCache: this.fontCache,
        builtInCMapCache: this.builtInCMapCache,
        options: this.evaluatorOptions,
        pdfFunctionFactory: this.pdfFunctionFactory,
      });

      return partialEvaluator.getTextContent({
        stream: contentStream,
        task,
        resources: this.resources,
        normalizeWhitespace,
        combineTextItems,
        sink,
      });
    });
  }

  /**
   * Collects the `data` of the parsed annotations.
   *
   * @param {string} [intent] - When given, only annotations renderable for
   *   this intent are included; when falsy, all annotations are included.
   * @returns {Promise<Array>} The annotation data objects.
   */
  getAnnotationsData(intent) {
    return this._parsedAnnotations.then(function(annotations) {
      const annotationsData = [];
      for (let i = 0, ii = annotations.length; i < ii; i++) {
        if (!intent || isAnnotationRenderable(annotations[i], intent)) {
          annotationsData.push(annotations[i].data);
        }
      }
      return annotationsData;
    });
  }

  /** The page's `Annots` entry, or an empty array when absent. */
  get annotations() {
    return shadow(this, 'annotations',
                  this._getInheritableProperty('Annots') || []);
  }

  /**
   * Promise for the annotations created by `AnnotationFactory.create` for
   * every entry of `annotations`, with falsy results filtered out. If any
   * creation fails, a warning is logged and the promise resolves to `[]`.
   */
  get _parsedAnnotations() {
    const parsedAnnotations =
      this.pdfManager.ensure(this, 'annotations').then(() => {
        const annotationRefs = this.annotations;
        const annotationPromises = [];
        for (let i = 0, ii = annotationRefs.length; i < ii; i++) {
          annotationPromises.push(AnnotationFactory.create(
            this.xref, annotationRefs[i], this.pdfManager, this.idFactory));
        }

        return Promise.all(annotationPromises).then(function(annotations) {
          return annotations.filter(function isDefined(annotation) {
            return !!annotation;
          });
        }, function(reason) {
          warn(`_parsedAnnotations: "${reason}".`);
          return [];
        });
      });

    return shadow(this, '_parsedAnnotations', parsedAnnotations);
  }
}
2011-10-25 10:13:12 +09:00
/**
 * The `PDFDocument` holds all the data of the PDF file. Compared to the
 * `PDFDoc`, this one doesn't have any job management code.
 * Right now there exists one PDFDocument on the main thread + one object
 * for each worker. If there is no worker support enabled, there are two
 * `PDFDocument` objects on the main thread created.
 */
2012-04-13 04:11:22 +09:00
var PDFDocument = (function PDFDocumentClosure() {
// Number of leading bytes of the file that are hashed when the fingerprint
// cannot be taken from the trailer's `ID` entry.
var FINGERPRINT_FIRST_BYTES = 1024;
// Sixteen NUL characters; a trailer `ID` equal to this is treated as absent
// when computing the fingerprint.
var EMPTY_FINGERPRINT = '\x00\x00\x00\x00\x00\x00\x00' +
'\x00\x00\x00\x00\x00\x00\x00\x00\x00';
/**
 * Creates a document over the given data and sets up its `XRef` and
 * `PDFFunctionFactory`.
 *
 * @param {Object} pdfManager - Manager owning this document; its
 *   `evaluatorOptions.isEvalSupported` is forwarded to the function factory.
 * @param {Object|ArrayBuffer} arg - Either a stream (per `isStream`) or an
 *   array buffer (per `isArrayBuffer`), which is wrapped in a `Stream`.
 * @throws {Error} When `arg` is of neither kind, or the stream has no data.
 */
function PDFDocument(pdfManager, arg) {
  let stream;
  if (isStream(arg)) {
    stream = arg;
  } else if (isArrayBuffer(arg)) {
    stream = new Stream(arg);
  } else {
    throw new Error('PDFDocument: Unknown argument type');
  }
  if (stream.length <= 0) {
    throw new Error('PDFDocument: stream must have data');
  }

  this.pdfManager = pdfManager;
  this.stream = stream;
  this.xref = new XRef(stream, pdfManager);

  const evaluatorOptions = pdfManager.evaluatorOptions;
  this.pdfFunctionFactory = new PDFFunctionFactory({
    xref: this.xref,
    isEvalSupported: evaluatorOptions.isEvalSupported,
  });
  // Cache of page promises, indexed by page index (filled by `getPage`).
  this._pagePromises = [];
}
/**
 * Searches for `needle` in at most `limit` bytes of `stream`, starting at
 * the current stream position.
 *
 * @param {Object} stream - Object with `pos`, `end` and `getByte()`.
 * @param {string} needle - The text to look for.
 * @param {number} limit - Maximum number of bytes to scan; clamped so the
 *   scan never goes past `stream.end`.
 * @param {boolean} [backwards] - When truthy, the last occurrence within the
 *   scanned window is used instead of the first.
 * @returns {boolean} `true` if found, in which case `stream.pos` points at
 *   the start of the match; `false` otherwise, with `stream.pos` restored to
 *   its initial value.
 */
function find(stream, needle, limit, backwards) {
  const pos = stream.pos;
  const end = stream.end;
  if (pos + limit > end) {
    limit = end - pos;
  }

  const strBuf = [];
  for (let n = 0; n < limit; ++n) {
    strBuf.push(String.fromCharCode(stream.getByte()));
  }
  const str = strBuf.join('');

  // Reading advanced the stream; rewind before reporting the result.
  stream.pos = pos;
  const index = backwards ? str.lastIndexOf(needle) : str.indexOf(needle);
  if (index === -1) {
    return false; /* not found */
  }
  stream.pos += index;
  return true; /* found */
}
// Maps the standard document information dictionary keys to the predicate
// their value must satisfy in order to be copied into `documentInfo`.
const DocumentInfoValidators = {
  Title: isString,
  Author: isString,
  Subject: isString,
  Keywords: isString,
  Creator: isString,
  Producer: isString,
  CreationDate: isString,
  ModDate: isString,
  Trapped: isName,
};
2012-04-13 04:11:22 +09:00
PDFDocument.prototype = {
2013-02-07 08:19:29 +09:00
parse: function PDFDocument_parse(recoveryMode) {
this.setup(recoveryMode);
var version = this.catalog.catDict.get('Version');
if (isName(version)) {
this.pdfFormatVersion = version.name;
}
2013-08-16 23:53:05 +09:00
try {
// checking if AcroForm is present
this.acroForm = this.catalog.catDict.get('AcroForm');
if (this.acroForm) {
this.xfa = this.acroForm.get('XFA');
var fields = this.acroForm.get('Fields');
if ((!fields || !Array.isArray(fields) || fields.length === 0) &&
2013-08-16 23:53:05 +09:00
!this.xfa) {
// no fields and no XFA -- not a form (?)
this.acroForm = null;
}
}
} catch (ex) {
if (ex instanceof MissingDataException) {
throw ex;
}
2013-08-16 23:53:05 +09:00
info('Something wrong with AcroForm entry');
this.acroForm = null;
}
2013-02-07 08:19:29 +09:00
},
2011-10-25 10:13:12 +09:00
get linearization() {
let linearization = null;
try {
linearization = Linearization.create(this.stream);
} catch (err) {
if (err instanceof MissingDataException) {
throw err;
}
info(err);
2011-10-25 10:13:12 +09:00
}
// shadow the prototype getter with a data property
return shadow(this, 'linearization', linearization);
},
get startXRef() {
var stream = this.stream;
var startXRef = 0;
var linearization = this.linearization;
if (linearization) {
// Find end of first obj.
stream.reset();
2014-03-23 04:36:35 +09:00
if (find(stream, 'endobj', 1024)) {
2011-10-25 10:13:12 +09:00
startXRef = stream.pos + 6;
2014-03-23 04:36:35 +09:00
}
2011-10-25 10:13:12 +09:00
} else {
2011-12-05 07:00:22 +09:00
// Find startxref by jumping backward from the end of the file.
var step = 1024;
var found = false, pos = stream.end;
2011-12-05 07:00:22 +09:00
while (!found && pos > 0) {
pos -= step - 'startxref'.length;
2014-03-23 04:36:35 +09:00
if (pos < 0) {
pos = 0;
2014-03-23 04:36:35 +09:00
}
stream.pos = pos;
2011-12-05 07:00:22 +09:00
found = find(stream, 'startxref', step, true);
}
if (found) {
2011-10-25 10:13:12 +09:00
stream.skip(9);
var ch;
do {
2013-07-01 05:45:15 +09:00
ch = stream.getByte();
} while (isSpace(ch));
2011-10-25 10:13:12 +09:00
var str = '';
2013-07-01 05:45:15 +09:00
while (ch >= 0x20 && ch <= 0x39) { // < '9'
str += String.fromCharCode(ch);
ch = stream.getByte();
2011-10-25 10:13:12 +09:00
}
startXRef = parseInt(str, 10);
2014-03-23 04:36:35 +09:00
if (isNaN(startXRef)) {
2011-10-25 10:13:12 +09:00
startXRef = 0;
2014-03-23 04:36:35 +09:00
}
2011-10-25 10:13:12 +09:00
}
}
// shadow the prototype getter with a data property
return shadow(this, 'startXRef', startXRef);
},
2011-10-25 10:13:12 +09:00
// Find the header, remove leading garbage and setup the stream
// starting from the header.
2012-04-13 04:11:22 +09:00
checkHeader: function PDFDocument_checkHeader() {
2011-10-25 10:13:12 +09:00
var stream = this.stream;
stream.reset();
if (find(stream, '%PDF-', 1024)) {
// Found the header, trim off any garbage before it.
stream.moveStart();
2012-11-06 02:12:17 +09:00
// Reading file format version
var MAX_VERSION_LENGTH = 12;
var version = '', ch;
2013-07-01 05:45:15 +09:00
while ((ch = stream.getByte()) > 0x20) { // SPACE
2012-11-06 02:12:17 +09:00
if (version.length >= MAX_VERSION_LENGTH) {
break;
}
2013-07-01 05:45:15 +09:00
version += String.fromCharCode(ch);
2012-11-06 02:12:17 +09:00
}
if (!this.pdfFormatVersion) {
// removing "%PDF-"-prefix
this.pdfFormatVersion = version.substring(5);
}
2011-10-25 10:13:12 +09:00
return;
}
// May not be a PDF file, continue anyway.
},
2013-02-07 08:19:29 +09:00
parseStartXRef: function PDFDocument_parseStartXRef() {
var startXRef = this.startXRef;
this.xref.setStartXRef(startXRef);
},
setup: function PDFDocument_setup(recoveryMode) {
this.xref.parse(recoveryMode);
this.catalog = new Catalog(this.pdfManager, this.xref);
2011-10-25 10:13:12 +09:00
},
get numPages() {
var linearization = this.linearization;
var num = linearization ? linearization.numPages : this.catalog.numPages;
// shadow the prototype getter
return shadow(this, 'numPages', num);
},
2013-02-07 08:19:29 +09:00
get documentInfo() {
const docInfo = {
2013-02-01 06:46:44 +09:00
PDFFormatVersion: this.pdfFormatVersion,
IsLinearized: !!this.linearization,
2013-08-16 23:53:05 +09:00
IsAcroFormPresent: !!this.acroForm,
Fix inconsistent spacing and trailing commas in objects in `src/core/` files, so we can enable the `comma-dangle` and `object-curly-spacing` ESLint rules later on *Unfortunately this patch is fairly big, even though it only covers the `src/core` folder, but splitting it even further seemed difficult.* http://eslint.org/docs/rules/comma-dangle http://eslint.org/docs/rules/object-curly-spacing Given that we currently have quite inconsistent object formatting, fixing this in *one* big patch probably wouldn't be feasible (since I cannot imagine anyone wanting to review that); hence I've opted to try and do this piecewise instead. Please note: This patch was created automatically, using the ESLint --fix command line option. In a couple of places this caused lines to become too long, and I've fixed those manually; please refer to the interdiff below for the only hand-edits in this patch. ```diff diff --git a/src/core/evaluator.js b/src/core/evaluator.js index abab9027..dcd3594b 100644 --- a/src/core/evaluator.js +++ b/src/core/evaluator.js @@ -2785,7 +2785,8 @@ var EvaluatorPreprocessor = (function EvaluatorPreprocessorClosure() { t['Tz'] = { id: OPS.setHScale, numArgs: 1, variableArgs: false, }; t['TL'] = { id: OPS.setLeading, numArgs: 1, variableArgs: false, }; t['Tf'] = { id: OPS.setFont, numArgs: 2, variableArgs: false, }; - t['Tr'] = { id: OPS.setTextRenderingMode, numArgs: 1, variableArgs: false, }; + t['Tr'] = { id: OPS.setTextRenderingMode, numArgs: 1, + variableArgs: false, }; t['Ts'] = { id: OPS.setTextRise, numArgs: 1, variableArgs: false, }; t['Td'] = { id: OPS.moveText, numArgs: 2, variableArgs: false, }; t['TD'] = { id: OPS.setLeadingMoveText, numArgs: 2, variableArgs: false, }; diff --git a/src/core/jbig2.js b/src/core/jbig2.js index 5a17d482..71671541 100644 --- a/src/core/jbig2.js +++ b/src/core/jbig2.js @@ -123,19 +123,22 @@ var Jbig2Image = (function Jbig2ImageClosure() { { x: -1, y: -1, }, { x: 0, y: -1, }, { x: 1, y: -1, }, { x: -2, y: 0, }, { x: -1, 
y: 0, }], [{ x: -3, y: -1, }, { x: -2, y: -1, }, { x: -1, y: -1, }, { x: 0, y: -1, }, - { x: 1, y: -1, }, { x: -4, y: 0, }, { x: -3, y: 0, }, { x: -2, y: 0, }, { x: -1, y: 0, }] + { x: 1, y: -1, }, { x: -4, y: 0, }, { x: -3, y: 0, }, { x: -2, y: 0, }, + { x: -1, y: 0, }] ]; var RefinementTemplates = [ { coding: [{ x: 0, y: -1, }, { x: 1, y: -1, }, { x: -1, y: 0, }], - reference: [{ x: 0, y: -1, }, { x: 1, y: -1, }, { x: -1, y: 0, }, { x: 0, y: 0, }, - { x: 1, y: 0, }, { x: -1, y: 1, }, { x: 0, y: 1, }, { x: 1, y: 1, }], + reference: [{ x: 0, y: -1, }, { x: 1, y: -1, }, { x: -1, y: 0, }, + { x: 0, y: 0, }, { x: 1, y: 0, }, { x: -1, y: 1, }, + { x: 0, y: 1, }, { x: 1, y: 1, }], }, { - coding: [{ x: -1, y: -1, }, { x: 0, y: -1, }, { x: 1, y: -1, }, { x: -1, y: 0, }], - reference: [{ x: 0, y: -1, }, { x: -1, y: 0, }, { x: 0, y: 0, }, { x: 1, y: 0, }, - { x: 0, y: 1, }, { x: 1, y: 1, }], + coding: [{ x: -1, y: -1, }, { x: 0, y: -1, }, { x: 1, y: -1, }, + { x: -1, y: 0, }], + reference: [{ x: 0, y: -1, }, { x: -1, y: 0, }, { x: 0, y: 0, }, + { x: 1, y: 0, }, { x: 0, y: 1, }, { x: 1, y: 1, }], } ]; ```
2017-06-02 18:16:24 +09:00
IsXFAPresent: !!this.xfa,
2012-12-01 08:36:39 +09:00
};
let infoDict;
try {
infoDict = this.xref.trailer.get('Info');
} catch (err) {
if (err instanceof MissingDataException) {
throw err;
}
info('The document information dictionary is invalid.');
}
if (isDict(infoDict)) {
// Fill the document info with valid entries from the specification,
// as well as any existing well-formed custom entries.
for (let key of infoDict.getKeys()) {
const value = infoDict.get(key);
if (DocumentInfoValidators[key]) {
// Make sure the (standard) value conforms to the specification.
if (DocumentInfoValidators[key](value)) {
2014-03-23 04:36:35 +09:00
docInfo[key] = (typeof value !== 'string' ?
value : stringToPDFString(value));
2012-08-04 08:11:43 +09:00
} else {
info(`Bad value in document info for "${key}".`);
}
} else if (typeof key === 'string') {
// For custom values, only accept white-listed types to prevent
// errors that would occur when trying to send non-serializable
// objects to the main-thread (for example `Dict` or `Stream`).
let customValue;
if (isString(value)) {
customValue = stringToPDFString(value);
} else if (isName(value) || isNum(value) || isBool(value)) {
customValue = value;
} else {
info(`Unsupported value in document info for (custom) "${key}".`);
continue;
}
if (!docInfo['Custom']) {
docInfo['Custom'] = Object.create(null);
2012-08-04 08:11:43 +09:00
}
docInfo['Custom'][key] = customValue;
2012-08-04 08:11:43 +09:00
}
}
}
2013-02-07 08:19:29 +09:00
return shadow(this, 'documentInfo', docInfo);
},
2013-02-07 08:19:29 +09:00
get fingerprint() {
var xref = this.xref, hash, fileID = '';
var idArray = xref.trailer.get('ID');
2013-10-03 17:09:06 +09:00
if (Array.isArray(idArray) && idArray[0] && isString(idArray[0]) &&
idArray[0] !== EMPTY_FINGERPRINT) {
hash = stringToBytes(idArray[0]);
} else {
if (this.stream.ensureRange) {
this.stream.ensureRange(0,
Math.min(FINGERPRINT_FIRST_BYTES, this.stream.end));
}
hash = calculateMD5(this.stream.bytes.subarray(0,
FINGERPRINT_FIRST_BYTES), 0, FINGERPRINT_FIRST_BYTES);
2013-10-03 17:09:06 +09:00
}
for (var i = 0, n = hash.length; i < n; i++) {
var hex = hash[i].toString(16);
fileID += hex.length === 1 ? '0' + hex : hex;
}
2012-03-27 07:14:59 +09:00
2013-02-07 08:19:29 +09:00
return shadow(this, 'fingerprint', fileID);
},
2013-02-07 08:19:29 +09:00
_getLinearizationPage(pageIndex) {
const { catalog, linearization, } = this;
assert(linearization && linearization.pageFirst === pageIndex);
const ref = new Ref(linearization.objectNumberFirst, 0);
return this.xref.fetchAsync(ref).then((obj) => {
// Ensure that the object that was found is actually a Page dictionary.
if (isDict(obj, 'Page') ||
(isDict(obj) && !obj.has('Type') && obj.has('Contents'))) {
if (ref && !catalog.pageKidsCountCache.has(ref)) {
catalog.pageKidsCountCache.put(ref, 1); // Cache the Page reference.
}
return [obj, ref];
}
throw new FormatError('The Linearization dictionary doesn\'t point ' +
'to a valid Page dictionary.');
}).catch((reason) => {
info(reason);
return catalog.getPageDict(pageIndex);
});
},
getPage(pageIndex) {
if (this._pagePromises[pageIndex] !== undefined) {
return this._pagePromises[pageIndex];
}
const { catalog, linearization, } = this;
const promise = (linearization && linearization.pageFirst === pageIndex) ?
this._getLinearizationPage(pageIndex) : catalog.getPageDict(pageIndex);
return this._pagePromises[pageIndex] = promise.then(([pageDict, ref]) => {
return new Page({
pdfManager: this.pdfManager,
xref: this.xref,
pageIndex,
pageDict,
ref,
fontCache: catalog.fontCache,
builtInCMapCache: catalog.builtInCMapCache,
pdfFunctionFactory: this.pdfFunctionFactory,
});
});
},
Check that the first page can be successfully loaded, to try and ascertain the validity of the XRef table (issue 7496, issue 10326) For PDF documents with sufficiently broken XRef tables, it's usually quite obvious when you need to fall back to indexing the entire file. However, for certain kinds of corrupted PDF documents the XRef table will, for all intents and purposes, appear to be valid. It's not until you actually try to fetch various objects that things will start to break, which is the case in the referenced issues[1]. Since there's generally a real effort being made in PDF.js to load even corrupt PDF documents, this patch contains a suggested approach to attempt to do a bit more validation of the XRef table during the initial document loading phase. Here the choice is made to attempt to load the *first* page, as a basic sanity check of the validity of the XRef table. Please note that attempting to load a more-or-less arbitrarily chosen object without any context of what it's supposed to be isn't very useful, which is why this particular choice was made. Obviously, just because the first page can be loaded successfully that doesn't guarantee that the *entire* XRef table is valid, however if even the first page fails to load you can be reasonably sure that the document is *not* valid[2]. Even though this patch won't cause any significant increase in the amount of parsing required during initial loading of the document[3], it will require loading of more data upfront which thus delays the initial `getDocument` call. Whether or not this is a problem depends very much on what you actually measure, please consider the following examples: ```javascript console.time('first'); getDocument(...).promise.then((pdfDocument) => { console.timeEnd('first'); }); console.time('second'); getDocument(...).promise.then((pdfDocument) => { pdfDocument.getPage(1).then((pdfPage) => { // Note: the API uses `pageNumber >= 1`, the Worker uses `pageIndex >= 0`. 
console.timeEnd('second'); }); }); ``` The first case is pretty much guaranteed to show a small regression, however the second case won't be affected at all since the Worker caches the result of `getPage` calls. Again, please remember that the second case is what matters for the standard PDF.js use-case which is why I'm hoping that this patch is deemed acceptable. --- [1] In issue 7496, the problem is that the document is edited without the XRef table being correctly updated. In issue 10326, the generator was sorting the XRef table according to the offsets rather than the objects. [2] The idea of checking the first page in particular came from the "standard" use-case for the PDF.js library, i.e. the default viewer, where a failure to load the first page basically means that nothing will work; note how `{BaseViewer, PDFThumbnailViewer}.setDocument` depends completely on being able to fetch the *first* page. [3] The only extra parsing is caused by, potentially, having to traverse *part* of the `Pages` tree to find the first page.
2018-12-05 05:51:27 +09:00
checkFirstPage() {
return this.getPage(0).catch((reason) => {
if (reason instanceof XRefEntryException) {
// Clear out the various caches to ensure that we haven't stored any
// inconsistent and/or incorrect state, since that could easily break
// subsequent `this.getPage` calls.
this._pagePromises.length = 0;
this.cleanup();
throw new XRefParseException();
}
});
},
cleanup: function PDFDocument_cleanup() {
return this.catalog.cleanup();
Fix inconsistent spacing and trailing commas in objects in `src/core/` files, so we can enable the `comma-dangle` and `object-curly-spacing` ESLint rules later on *Unfortunately this patch is fairly big, even though it only covers the `src/core` folder, but splitting it even further seemed difficult.* http://eslint.org/docs/rules/comma-dangle http://eslint.org/docs/rules/object-curly-spacing Given that we currently have quite inconsistent object formatting, fixing this in *one* big patch probably wouldn't be feasible (since I cannot imagine anyone wanting to review that); hence I've opted to try and do this piecewise instead. Please note: This patch was created automatically, using the ESLint --fix command line option. In a couple of places this caused lines to become too long, and I've fixed those manually; please refer to the interdiff below for the only hand-edits in this patch. ```diff diff --git a/src/core/evaluator.js b/src/core/evaluator.js index abab9027..dcd3594b 100644 --- a/src/core/evaluator.js +++ b/src/core/evaluator.js @@ -2785,7 +2785,8 @@ var EvaluatorPreprocessor = (function EvaluatorPreprocessorClosure() { t['Tz'] = { id: OPS.setHScale, numArgs: 1, variableArgs: false, }; t['TL'] = { id: OPS.setLeading, numArgs: 1, variableArgs: false, }; t['Tf'] = { id: OPS.setFont, numArgs: 2, variableArgs: false, }; - t['Tr'] = { id: OPS.setTextRenderingMode, numArgs: 1, variableArgs: false, }; + t['Tr'] = { id: OPS.setTextRenderingMode, numArgs: 1, + variableArgs: false, }; t['Ts'] = { id: OPS.setTextRise, numArgs: 1, variableArgs: false, }; t['Td'] = { id: OPS.moveText, numArgs: 2, variableArgs: false, }; t['TD'] = { id: OPS.setLeadingMoveText, numArgs: 2, variableArgs: false, }; diff --git a/src/core/jbig2.js b/src/core/jbig2.js index 5a17d482..71671541 100644 --- a/src/core/jbig2.js +++ b/src/core/jbig2.js @@ -123,19 +123,22 @@ var Jbig2Image = (function Jbig2ImageClosure() { { x: -1, y: -1, }, { x: 0, y: -1, }, { x: 1, y: -1, }, { x: -2, y: 0, }, { x: -1, 
y: 0, }], [{ x: -3, y: -1, }, { x: -2, y: -1, }, { x: -1, y: -1, }, { x: 0, y: -1, }, - { x: 1, y: -1, }, { x: -4, y: 0, }, { x: -3, y: 0, }, { x: -2, y: 0, }, { x: -1, y: 0, }] + { x: 1, y: -1, }, { x: -4, y: 0, }, { x: -3, y: 0, }, { x: -2, y: 0, }, + { x: -1, y: 0, }] ]; var RefinementTemplates = [ { coding: [{ x: 0, y: -1, }, { x: 1, y: -1, }, { x: -1, y: 0, }], - reference: [{ x: 0, y: -1, }, { x: 1, y: -1, }, { x: -1, y: 0, }, { x: 0, y: 0, }, - { x: 1, y: 0, }, { x: -1, y: 1, }, { x: 0, y: 1, }, { x: 1, y: 1, }], + reference: [{ x: 0, y: -1, }, { x: 1, y: -1, }, { x: -1, y: 0, }, + { x: 0, y: 0, }, { x: 1, y: 0, }, { x: -1, y: 1, }, + { x: 0, y: 1, }, { x: 1, y: 1, }], }, { - coding: [{ x: -1, y: -1, }, { x: 0, y: -1, }, { x: 1, y: -1, }, { x: -1, y: 0, }], - reference: [{ x: 0, y: -1, }, { x: -1, y: 0, }, { x: 0, y: 0, }, { x: 1, y: 0, }, - { x: 0, y: 1, }, { x: 1, y: 1, }], + coding: [{ x: -1, y: -1, }, { x: 0, y: -1, }, { x: 1, y: -1, }, + { x: -1, y: 0, }], + reference: [{ x: 0, y: -1, }, { x: -1, y: 0, }, { x: 0, y: 0, }, + { x: 1, y: 0, }, { x: 0, y: 1, }, { x: 1, y: 1, }], } ]; ```
2017-06-02 18:16:24 +09:00
},
2011-10-25 10:13:12 +09:00
};
2012-04-13 04:11:22 +09:00
return PDFDocument;
2011-10-25 10:13:12 +09:00
})();
export {
Page,
PDFDocument,
};