pdf.js/src/core/core.js

546 lines
18 KiB
JavaScript
Raw Normal View History

2012-09-01 07:48:21 +09:00
/* Copyright 2012 Mozilla Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* globals warn, Dict, isDict, shadow, isArray, Util, StreamsSequenceStream,
isStream, NullStream, ObjectLoader, PartialEvaluator, Promise,
OperatorList, Annotation, error, assert, XRef, isArrayBuffer, Stream,
isString, isName, info, Linearization, MissingDataException, Lexer,
Catalog, stringToPDFString, stringToBytes, calculateMD5,
AnnotationFactory */
2011-10-25 10:13:12 +09:00
2011-10-26 10:18:22 +09:00
'use strict';
2011-12-09 07:18:43 +09:00
var Page = (function PageClosure() {

  // Fallback media box (US Letter, in PDF units) used when the page
  // does not provide a valid four-element /MediaBox array.
  var LETTER_SIZE_MEDIABOX = [0, 0, 612, 792];

  /**
   * Represents a single page of the document.
   *
   * @param pdfManager - Manager used to `ensure` lazily loaded properties.
   * @param xref - Cross-reference table used to resolve references.
   * @param {number} pageIndex - Index of the page; also used to build the
   *   per-page id prefix handed to the evaluator.
   * @param pageDict - The page dictionary.
   * @param ref - Reference of the page object.
   * @param fontCache - Font cache handed to the evaluator.
   */
  function Page(pdfManager, xref, pageIndex, pageDict, ref, fontCache) {
    this.pdfManager = pdfManager;
    this.pageIndex = pageIndex;
    this.pageDict = pageDict;
    this.xref = xref;
    this.ref = ref;
    this.fontCache = fontCache;
    this.idCounters = {
      obj: 0
    };
    this.resourcesPromise = null;
  }

  Page.prototype = {
    /**
     * Returns the value stored under `key` directly in the page dictionary
     * (no inheritance).
     */
    getPageProp: function Page_getPageProp(key) {
      return this.pageDict.get(key);
    },

    /**
     * Looks up `key` in the page dictionary and in every ancestor reachable
     * through 'Parent'.
     *
     * @returns `Dict.empty` when no level defines the key; the value closest
     *   to the page when only one level defines it, when that value is not
     *   a dictionary, or when the walk was aborted; otherwise the result of
     *   `Dict.merge` over all collected dictionaries.
     */
    getInheritedPageProp: function Page_getInheritedPageProp(key) {
      var dict = this.pageDict, valueArray = null, loopCount = 0;
      var MAX_LOOP_COUNT = 100;
      // Always walk up the entire parent chain, to be able to find
      // e.g. \Resources placed on multiple levels of the tree.
      while (dict) {
        var value = dict.get(key);
        // Only a missing (undefined/null) entry counts as absent. Testing
        // plain truthiness would drop an explicit falsy value such as
        // `/Rotate 0` on the page and wrongly inherit the parent's value.
        if (value !== undefined && value !== null) {
          if (!valueArray) {
            valueArray = [];
          }
          valueArray.push(value);
        }
        // Guard against cyclic or excessively deep 'Parent' chains.
        if (++loopCount > MAX_LOOP_COUNT) {
          warn('Page_getInheritedPageProp: maximum loop count exceeded.');
          break;
        }
        dict = dict.get('Parent');
      }
      if (!valueArray) {
        return Dict.empty;
      }
      if (valueArray.length === 1 || !isDict(valueArray[0]) ||
          loopCount > MAX_LOOP_COUNT) {
        return valueArray[0];
      }
      return Dict.merge(this.xref, valueArray);
    },

    /** The page's own 'Contents' entry (not inherited). */
    get content() {
      return this.getPageProp('Contents');
    },

    /** The (inherited) 'Resources' of the page, cached after first access. */
    get resources() {
      // For robustness: The spec states that a \Resources entry has to be
      // present, but can be empty. Some documents omit it still; in that
      // case we return an empty dictionary.
      return shadow(this, 'resources', this.getInheritedPageProp('Resources'));
    },

    /** The (inherited) 'MediaBox', or letter size when it is invalid. */
    get mediaBox() {
      var obj = this.getInheritedPageProp('MediaBox');
      // Reset invalid media box to letter size.
      if (!isArray(obj) || obj.length !== 4) {
        obj = LETTER_SIZE_MEDIABOX;
      }
      return shadow(this, 'mediaBox', obj);
    },

    /**
     * The visible area of the page: the crop box intersected with the media
     * box, or the media box when there is no valid crop box or no
     * intersection.
     */
    get view() {
      var mediaBox = this.mediaBox;
      var cropBox = this.getInheritedPageProp('CropBox');
      if (!isArray(cropBox) || cropBox.length !== 4) {
        return shadow(this, 'view', mediaBox);
      }

      // From the spec, 6th ed., p.963:
      // "The crop, bleed, trim, and art boxes should not ordinarily
      // extend beyond the boundaries of the media box. If they do, they are
      // effectively reduced to their intersection with the media box."
      cropBox = Util.intersect(cropBox, mediaBox);
      if (!cropBox) {
        return shadow(this, 'view', mediaBox);
      }
      return shadow(this, 'view', cropBox);
    },

    /** The (inherited) page rotation, normalized to 0, 90, 180 or 270. */
    get rotate() {
      var rotate = this.getInheritedPageProp('Rotate') || 0;
      // Normalize rotation so it's a multiple of 90 and between 0 and 270.
      // Non-numeric values yield NaN here and are therefore reset to 0 too.
      if (rotate % 90 !== 0) {
        rotate = 0;
      } else if (rotate >= 360) {
        rotate = rotate % 360;
      } else if (rotate < 0) {
        // The spec doesn't cover negatives, assume it's counterclockwise
        // rotation. The following is the other implementation of modulo.
        rotate = ((rotate % 360) + 360) % 360;
      }
      return shadow(this, 'rotate', rotate);
    },

    /**
     * Returns a single stream for the page content: a sequence stream when
     * 'Contents' is an array, the stream itself when it is a stream, and an
     * empty stream otherwise.
     */
    getContentStream: function Page_getContentStream() {
      var content = this.content;
      var stream;
      if (isArray(content)) {
        // fetching items
        var xref = this.xref;
        var i, n = content.length;
        var streams = [];
        for (i = 0; i < n; ++i) {
          streams.push(xref.fetchIfRef(content[i]));
        }
        stream = new StreamsSequenceStream(streams);
      } else if (isStream(content)) {
        stream = content;
      } else {
        // replacing non-existent page content with empty one
        stream = new NullStream();
      }
      return stream;
    },

    /**
     * Ensures the page resources are available, then loads the objects
     * stored under the given resource `keys`.
     *
     * @param {Array} keys - Resource dictionary keys to load.
     * @returns The promise returned by `ObjectLoader.load()`.
     */
    loadResources: function Page_loadResources(keys) {
      if (!this.resourcesPromise) {
        // TODO: add async getInheritedPageProp and remove this.
        this.resourcesPromise = this.pdfManager.ensure(this, 'resources');
      }
      return this.resourcesPromise.then(function resourceSuccess() {
        var objectLoader = new ObjectLoader(this.resources.map,
                                            keys,
                                            this.xref);
        return objectLoader.load();
      }.bind(this));
    },

    /**
     * Builds the operator list for the page content and appends the
     * operators of its annotations (if any).
     *
     * Sends a 'StartRenderPage' message through `handler` once the content
     * stream and resources are available.
     *
     * @returns A promise resolved with the flushed `OperatorList`.
     */
    getOperatorList: function Page_getOperatorList(handler, task, intent) {
      var self = this;

      var pdfManager = this.pdfManager;
      var contentStreamPromise = pdfManager.ensure(this, 'getContentStream',
                                                   []);
      var resourcesPromise = this.loadResources([
        'ExtGState',
        'ColorSpace',
        'Pattern',
        'Shading',
        'XObject',
        'Font'
        // ProcSet
        // Properties
      ]);

      var partialEvaluator = new PartialEvaluator(pdfManager, this.xref,
                                                  handler, this.pageIndex,
                                                  'p' + this.pageIndex + '_',
                                                  this.idCounters,
                                                  this.fontCache);

      var dataPromises = Promise.all([contentStreamPromise, resourcesPromise]);
      var pageListPromise = dataPromises.then(function(data) {
        var contentStream = data[0];
        var opList = new OperatorList(intent, handler, self.pageIndex);

        handler.send('StartRenderPage', {
          transparency: partialEvaluator.hasBlendModes(self.resources),
          pageIndex: self.pageIndex,
          intent: intent
        });
        return partialEvaluator.getOperatorList(contentStream, task,
          self.resources, opList).then(function () {
            return opList;
          });
      });

      var annotationsPromise = pdfManager.ensure(this, 'annotations');
      return Promise.all([pageListPromise, annotationsPromise]).then(
          function(datas) {
        var pageOpList = datas[0];
        var annotations = datas[1];

        if (annotations.length === 0) {
          pageOpList.flush(true);
          return pageOpList;
        }

        var annotationsReadyPromise = Annotation.appendToOperatorList(
          annotations, pageOpList, pdfManager, partialEvaluator, task, intent);
        return annotationsReadyPromise.then(function () {
          pageOpList.flush(true);
          return pageOpList;
        });
      });
    },

    /**
     * Extracts the text content of the page.
     *
     * Uses a no-op message handler, since nothing needs to be sent while
     * extracting text.
     *
     * @returns The promise returned by `PartialEvaluator.getTextContent`.
     */
    extractTextContent: function Page_extractTextContent(task) {
      var handler = {
        on: function nullHandlerOn() {},
        send: function nullHandlerSend() {}
      };

      var self = this;

      var pdfManager = this.pdfManager;
      var contentStreamPromise = pdfManager.ensure(this, 'getContentStream',
                                                   []);

      var resourcesPromise = this.loadResources([
        'ExtGState',
        'XObject',
        'Font'
      ]);

      var dataPromises = Promise.all([contentStreamPromise,
                                      resourcesPromise]);
      return dataPromises.then(function(data) {
        var contentStream = data[0];
        var partialEvaluator = new PartialEvaluator(pdfManager, self.xref,
                                                    handler, self.pageIndex,
                                                    'p' + self.pageIndex + '_',
                                                    self.idCounters,
                                                    self.fontCache);

        return partialEvaluator.getTextContent(contentStream,
                                               task,
                                               self.resources);
      });
    },

    /** Returns the `data` member of every annotation on the page. */
    getAnnotationsData: function Page_getAnnotationsData() {
      var annotations = this.annotations;
      var annotationsData = [];
      for (var i = 0, n = annotations.length; i < n; ++i) {
        annotationsData.push(annotations[i].data);
      }
      return annotationsData;
    },

    /**
     * The annotations of the page that are viewable or printable, cached
     * after first access.
     */
    get annotations() {
      var annotations = [];
      var annotationRefs = this.getInheritedPageProp('Annots') || [];
      var annotationFactory = new AnnotationFactory();
      for (var i = 0, n = annotationRefs.length; i < n; ++i) {
        var annotationRef = annotationRefs[i];
        var annotation = annotationFactory.create(this.xref, annotationRef);
        if (annotation &&
            (annotation.isViewable() || annotation.isPrintable())) {
          annotations.push(annotation);
        }
      }
      return shadow(this, 'annotations', annotations);
    }
  };

  return Page;
})();
/**
 * The `PDFDocument` holds all the data of the PDF file. Compared to the
 * `PDFDoc`, this one doesn't have any job management code.
 * Right now there exists one PDFDocument on the main thread + one object
 * for each worker. If there is no worker support enabled, there are two
 * `PDFDocument` objects on the main thread created.
 */
var PDFDocument = (function PDFDocumentClosure() {
  // Number of leading bytes hashed when the trailer has no usable ID.
  var FINGERPRINT_FIRST_BYTES = 1024;
  // A trailer ID consisting of 16 NUL characters is treated as missing.
  var EMPTY_FINGERPRINT = '\x00\x00\x00\x00\x00\x00\x00' +
    '\x00\x00\x00\x00\x00\x00\x00\x00\x00';

  /**
   * @param pdfManager - Manager passed on to `XRef` and `Catalog`.
   * @param arg - Either a stream or an ArrayBuffer holding the file data;
   *   any other type is reported through `error`.
   * @param password - Password passed on to `XRef`.
   */
  function PDFDocument(pdfManager, arg, password) {
    if (isStream(arg)) {
      init.call(this, pdfManager, arg, password);
    } else if (isArrayBuffer(arg)) {
      init.call(this, pdfManager, new Stream(arg), password);
    } else {
      error('PDFDocument: Unknown argument type');
    }
  }

  // Shared constructor body; called with `this` bound to the new document.
  function init(pdfManager, stream, password) {
    assert(stream.length > 0, 'stream must have data');
    this.pdfManager = pdfManager;
    this.stream = stream;
    var xref = new XRef(this.stream, password, pdfManager);
    this.xref = xref;
  }

  /**
   * Searches for `needle` within the next `limit` bytes of `stream`,
   * starting at the current position (clamped to the end of the stream).
   *
   * @param {boolean} [backwards] - When truthy, the last occurrence within
   *   the window is used instead of the first.
   * @returns {boolean} `true` when found, with `stream.pos` moved to the
   *   start of the match; `false` otherwise, with `stream.pos` unchanged.
   */
  function find(stream, needle, limit, backwards) {
    var pos = stream.pos;
    var end = stream.end;
    var strBuf = [];
    if (pos + limit > end) {
      limit = end - pos;
    }
    for (var n = 0; n < limit; ++n) {
      strBuf.push(String.fromCharCode(stream.getByte()));
    }
    var str = strBuf.join('');
    // Reading advanced the stream; restore the position before seeking.
    stream.pos = pos;
    var index = backwards ? str.lastIndexOf(needle) : str.indexOf(needle);
    if (index === -1) {
      return false; /* not found */
    }
    stream.pos += index;
    return true; /* found */
  }

  // Maps each supported document information key to its validator.
  var DocumentInfoValidators = {
    get entries() {
      // Lazily build this since all the validation functions below are not
      // defined until after this file loads.
      return shadow(this, 'entries', {
        Title: isString,
        Author: isString,
        Subject: isString,
        Keywords: isString,
        Creator: isString,
        Producer: isString,
        CreationDate: isString,
        ModDate: isString,
        Trapped: isName
      });
    }
  };

  PDFDocument.prototype = {
    /**
     * Parses the cross-reference data, creates the catalog, and records
     * the catalog 'Version' (when it is a name) and the AcroForm/XFA
     * entries.
     */
    parse: function PDFDocument_parse(recoveryMode) {
      this.setup(recoveryMode);
      var version = this.catalog.catDict.get('Version');
      if (isName(version)) {
        this.pdfFormatVersion = version.name;
      }
      try {
        // checking if AcroForm is present
        this.acroForm = this.catalog.catDict.get('AcroForm');
        if (this.acroForm) {
          this.xfa = this.acroForm.get('XFA');
          var fields = this.acroForm.get('Fields');
          if ((!fields || !isArray(fields) || fields.length === 0) &&
              !this.xfa) {
            // no fields and no XFA -- not a form (?)
            this.acroForm = null;
          }
        }
      } catch (ex) {
        // Missing data is not a malformed AcroForm: propagate it (as the
        // `linearization` getter does) instead of silently dropping the
        // form.
        if (ex instanceof MissingDataException) {
          throw ex;
        }
        info('Something wrong with AcroForm entry');
        this.acroForm = null;
      }
    },

    /**
     * The result of `Linearization.create` for the stream, or `null` when
     * the stream is empty or creation fails.
     */
    get linearization() {
      var linearization = null;
      if (this.stream.length) {
        try {
          linearization = Linearization.create(this.stream);
        } catch (err) {
          if (err instanceof MissingDataException) {
            throw err;
          }
          info(err);
        }
      }
      // shadow the prototype getter with a data property
      return shadow(this, 'linearization', linearization);
    },

    /**
     * The offset at which cross-reference parsing should start, or 0 when
     * it cannot be determined.
     */
    get startXRef() {
      var stream = this.stream;
      var startXRef = 0;
      var linearization = this.linearization;
      if (linearization) {
        // Find end of first obj.
        stream.reset();
        if (find(stream, 'endobj', 1024)) {
          startXRef = stream.pos + 6;
        }
      } else {
        // Find startxref by jumping backward from the end of the file.
        var step = 1024;
        var found = false, pos = stream.end;
        while (!found && pos > 0) {
          // Overlap consecutive windows by the keyword length so that a
          // keyword straddling a window boundary is still found.
          pos -= step - 'startxref'.length;
          if (pos < 0) {
            pos = 0;
          }
          stream.pos = pos;
          found = find(stream, 'startxref', step, true);
        }
        if (found) {
          stream.skip(9);
          var ch;
          do {
            ch = stream.getByte();
          } while (Lexer.isSpace(ch));
          var str = '';
          // Collect bytes in the range 0x20 (space) .. 0x39 ('9');
          // `parseInt` below stops at the first non-digit.
          while (ch >= 0x20 && ch <= 0x39) { // < '9'
            str += String.fromCharCode(ch);
            ch = stream.getByte();
          }
          startXRef = parseInt(str, 10);
          if (isNaN(startXRef)) {
            startXRef = 0;
          }
        }
      }
      // shadow the prototype getter with a data property
      return shadow(this, 'startXRef', startXRef);
    },

    /** `linearization.mainXRefEntriesOffset`, or 0 when not linearized. */
    get mainXRefEntriesOffset() {
      var mainXRefEntriesOffset = 0;
      var linearization = this.linearization;
      if (linearization) {
        mainXRefEntriesOffset = linearization.mainXRefEntriesOffset;
      }
      // shadow the prototype getter with a data property
      return shadow(this, 'mainXRefEntriesOffset', mainXRefEntriesOffset);
    },

    // Find the header, remove leading garbage and setup the stream
    // starting from the header.
    checkHeader: function PDFDocument_checkHeader() {
      var stream = this.stream;
      stream.reset();
      if (find(stream, '%PDF-', 1024)) {
        // Found the header, trim off any garbage before it.
        stream.moveStart();
        // Reading file format version
        var MAX_VERSION_LENGTH = 12;
        var version = '', ch;
        while ((ch = stream.getByte()) > 0x20) { // SPACE
          if (version.length >= MAX_VERSION_LENGTH) {
            break;
          }
          version += String.fromCharCode(ch);
        }
        if (!this.pdfFormatVersion) {
          // removing "%PDF-"-prefix
          this.pdfFormatVersion = version.substring(5);
        }
        return;
      }
      // May not be a PDF file, continue anyway.
    },

    /** Passes the computed `startXRef` offset on to the xref table. */
    parseStartXRef: function PDFDocument_parseStartXRef() {
      var startXRef = this.startXRef;
      this.xref.setStartXRef(startXRef);
    },

    /** Parses the xref table and creates the document catalog. */
    setup: function PDFDocument_setup(recoveryMode) {
      this.xref.parse(recoveryMode);
      this.catalog = new Catalog(this.pdfManager, this.xref);
    },

    /** Page count, taken from the linearization data when available. */
    get numPages() {
      var linearization = this.linearization;
      var num = linearization ? linearization.numPages : this.catalog.numPages;
      // shadow the prototype getter
      return shadow(this, 'numPages', num);
    },

    /**
     * An object with `PDFFormatVersion`, `IsAcroFormPresent`,
     * `IsXFAPresent` and every valid entry of the trailer's 'Info'
     * dictionary listed in `DocumentInfoValidators`.
     */
    get documentInfo() {
      var docInfo = {
        PDFFormatVersion: this.pdfFormatVersion,
        IsAcroFormPresent: !!this.acroForm,
        IsXFAPresent: !!this.xfa
      };
      var infoDict;
      try {
        infoDict = this.xref.trailer.get('Info');
      } catch (err) {
        // Missing data must reach the caller (as in the `linearization`
        // getter); otherwise an incomplete result would be cached by
        // `shadow` below.
        if (err instanceof MissingDataException) {
          throw err;
        }
        info('The document information dictionary is invalid.');
      }
      // A corrupt file may store a non-dictionary object under 'Info';
      // calling `has`/`get` on it would throw, so require a dictionary.
      if (isDict(infoDict)) {
        var validEntries = DocumentInfoValidators.entries;
        // Only fill the document info with valid entries from the spec.
        for (var key in validEntries) {
          if (infoDict.has(key)) {
            var value = infoDict.get(key);
            // Make sure the value conforms to the spec.
            if (validEntries[key](value)) {
              docInfo[key] = (typeof value !== 'string' ?
                              value : stringToPDFString(value));
            } else {
              info('Bad value in document info for "' + key + '"');
            }
          }
        }
      }
      return shadow(this, 'documentInfo', docInfo);
    },

    /**
     * Hexadecimal fingerprint of the document: the bytes of the first
     * trailer 'ID' string when it is a non-empty string different from
     * `EMPTY_FINGERPRINT`, otherwise the MD5 computed over the first
     * `FINGERPRINT_FIRST_BYTES` bytes of the stream.
     */
    get fingerprint() {
      var xref = this.xref, hash, fileID = '';
      var idArray = xref.trailer.get('ID');

      if (idArray && isArray(idArray) && idArray[0] && isString(idArray[0]) &&
          idArray[0] !== EMPTY_FINGERPRINT) {
        hash = stringToBytes(idArray[0]);
      } else {
        if (this.stream.ensureRange) {
          this.stream.ensureRange(0,
            Math.min(FINGERPRINT_FIRST_BYTES, this.stream.end));
        }
        hash = calculateMD5(this.stream.bytes.subarray(0,
          FINGERPRINT_FIRST_BYTES), 0, FINGERPRINT_FIRST_BYTES);
      }

      // Hex-encode each byte, zero-padded to two digits.
      for (var i = 0, n = hash.length; i < n; i++) {
        var hex = hash[i].toString(16);
        fileID += hex.length === 1 ? '0' + hex : hex;
      }

      return shadow(this, 'fingerprint', fileID);
    },

    /** Delegates to `catalog.getPage`. */
    getPage: function PDFDocument_getPage(pageIndex) {
      return this.catalog.getPage(pageIndex);
    },

    /** Delegates to `catalog.cleanup`. */
    cleanup: function PDFDocument_cleanup() {
      return this.catalog.cleanup();
    }
  };

  return PDFDocument;
})();