Move the XRef from src/core/obj.js and into its own file

The size of the `src/core/obj.js` file has increased slowly over the years, and it also contains a fair amount of *distinct* functionality.
In order to improve readability and make it easier to navigate through the code, this patch moves the `XRef` implementation into its own file.
Jonas Jenwald 2021-04-13 18:26:07 +02:00
parent 24e5ecdf76
commit e8750cfe95
3 changed files with 889 additions and 856 deletions
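
For orientation, the new `src/core/xref.js` file essentially receives the `XRef` code that this patch deletes from the end of `src/core/obj.js`. The sketch below is only an illustration of the shape that file would take, not a copy of it: the imports are inferred from the identifiers the deleted code visibly uses in this diff, the constructor is taken verbatim from that code, and the remaining methods are elided.

```js
// src/core/xref.js -- illustrative sketch only, not the file added by this
// commit. Imports are inferred from the XRef code visible in this diff;
// the real file may need more (e.g. for the methods not shown here).
/* eslint-disable no-var */

import {
  bytesToString,
  FormatError,
  InvalidPDFException,
  warn,
} from "../shared/util.js";
import { Cmd, Dict, isCmd, isDict, Ref } from "./primitives.js";
import { MissingDataException, XRefParseException } from "./core_utils.js";
import { Lexer, Parser } from "./parser.js";
import { CipherTransformFactory } from "./crypto.js";

var XRef = (function XRefClosure() {
  // eslint-disable-next-line no-shadow
  function XRef(stream, pdfManager) {
    this.stream = stream;
    this.pdfManager = pdfManager;
    this.entries = [];
    this.xrefstms = Object.create(null);
    this._cacheMap = new Map(); // Prepare the XRef cache.
    this.stats = {
      streamTypes: Object.create(null),
      fontTypes: Object.create(null),
    };
    this._newRefNum = null;
  }

  XRef.prototype = {
    // getNewRef, parse, readXRef, readXRefTable, processXRefStream,
    // indexObjects, etc. -- the methods deleted from obj.js move here.
  };

  return XRef;
})();

export { XRef };
```

Consumers then switch to `import { XRef } from "./xref.js";`, as the first file's hunks below show.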

View File

@@ -34,7 +34,6 @@ import {
   Util,
   warn,
 } from "../shared/util.js";
-import { Catalog, XRef } from "./obj.js";
 import {
   clearPrimitiveCaches,
   Dict,
@@ -55,12 +54,14 @@ import {
 import { NullStream, Stream, StreamsSequenceStream } from "./stream.js";
 import { AnnotationFactory } from "./annotation.js";
 import { calculateMD5 } from "./crypto.js";
+import { Catalog } from "./obj.js";
 import { Linearization } from "./parser.js";
 import { ObjectLoader } from "./object_loader.js";
 import { OperatorList } from "./operator_list.js";
 import { PartialEvaluator } from "./evaluator.js";
 import { StructTreePage } from "./struct_tree.js";
 import { XFAFactory } from "./xfa/factory.js";
+import { XRef } from "./xref.js";
 
 const DEFAULT_USER_UNIT = 1.0;
 const LETTER_SIZE_MEDIABOX = [0, 0, 612, 792];

View File

@@ -12,17 +12,14 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-/* eslint-disable no-var */
 import {
-  assert,
   bytesToString,
   createPromiseCapability,
   createValidAbsoluteUrl,
   DocumentActionEventType,
   FormatError,
   info,
-  InvalidPDFException,
   isBool,
   isNum,
   isString,
@@ -35,15 +32,12 @@ import {
 } from "../shared/util.js";
 import {
   clearPrimitiveCaches,
-  Cmd,
   Dict,
-  isCmd,
   isDict,
   isName,
   isRef,
   isRefsEqual,
   isStream,
-  Ref,
   RefSet,
   RefSetCache,
 } from "./primitives.js";
@@ -51,12 +45,8 @@ import {
   collectActions,
   MissingDataException,
   toRomanNumerals,
-  XRefEntryException,
-  XRefParseException,
 } from "./core_utils.js";
-import { Lexer, Parser } from "./parser.js";
 import { NameTree, NumberTree } from "./name_number_tree.js";
-import { CipherTransformFactory } from "./crypto.js";
 import { ColorSpace } from "./colorspace.js";
 import { FileSpec } from "./file_spec.js";
 import { GlobalImageCache } from "./image_utils.js";
@@ -1426,848 +1416,4 @@ class Catalog {
   }
 }
 
-var XRef = (function XRefClosure() {
+export { Catalog };
// eslint-disable-next-line no-shadow
function XRef(stream, pdfManager) {
this.stream = stream;
this.pdfManager = pdfManager;
this.entries = [];
this.xrefstms = Object.create(null);
this._cacheMap = new Map(); // Prepare the XRef cache.
this.stats = {
streamTypes: Object.create(null),
fontTypes: Object.create(null),
};
this._newRefNum = null;
}
XRef.prototype = {
getNewRef: function XRef_getNewRef() {
if (this._newRefNum === null) {
this._newRefNum = this.entries.length;
}
return Ref.get(this._newRefNum++, 0);
},
resetNewRef: function XRef_resetNewRef() {
this._newRefNum = null;
},
setStartXRef: function XRef_setStartXRef(startXRef) {
// Store the starting positions of xref tables as we process them
// so we can recover from missing data errors
this.startXRefQueue = [startXRef];
},
parse: function XRef_parse(recoveryMode) {
var trailerDict;
if (!recoveryMode) {
trailerDict = this.readXRef();
} else {
warn("Indexing all PDF objects");
trailerDict = this.indexObjects();
}
trailerDict.assignXref(this);
this.trailer = trailerDict;
let encrypt;
try {
encrypt = trailerDict.get("Encrypt");
} catch (ex) {
if (ex instanceof MissingDataException) {
throw ex;
}
warn(`XRef.parse - Invalid "Encrypt" reference: "${ex}".`);
}
if (isDict(encrypt)) {
var ids = trailerDict.get("ID");
var fileId = ids && ids.length ? ids[0] : "";
// The 'Encrypt' dictionary itself should not be encrypted, and by
// setting `suppressEncryption` we can prevent an infinite loop inside
// of `XRef_fetchUncompressed` if the dictionary contains indirect
// objects (fixes issue7665.pdf).
encrypt.suppressEncryption = true;
this.encrypt = new CipherTransformFactory(
encrypt,
fileId,
this.pdfManager.password
);
}
// Get the root dictionary (catalog) object, and do some basic validation.
let root;
try {
root = trailerDict.get("Root");
} catch (ex) {
if (ex instanceof MissingDataException) {
throw ex;
}
warn(`XRef.parse - Invalid "Root" reference: "${ex}".`);
}
if (isDict(root) && root.has("Pages")) {
this.root = root;
} else {
if (!recoveryMode) {
throw new XRefParseException();
}
throw new FormatError("Invalid root reference");
}
},
processXRefTable: function XRef_processXRefTable(parser) {
if (!("tableState" in this)) {
// Stores state of the table as we process it so we can resume
// from middle of table in case of missing data error
this.tableState = {
entryNum: 0,
streamPos: parser.lexer.stream.pos,
parserBuf1: parser.buf1,
parserBuf2: parser.buf2,
};
}
var obj = this.readXRefTable(parser);
// Sanity check
if (!isCmd(obj, "trailer")) {
throw new FormatError(
"Invalid XRef table: could not find trailer dictionary"
);
}
// Read trailer dictionary, e.g.
// trailer
// << /Size 22
// /Root 20R
// /Info 10R
// /ID [ <81b14aafa313db63dbd6f981e49f94f4> ]
// >>
// The parser goes through the entire stream << ... >> and provides
// a getter interface for the key-value table
var dict = parser.getObj();
// The pdflib PDF generator can generate a nested trailer dictionary
if (!isDict(dict) && dict.dict) {
dict = dict.dict;
}
if (!isDict(dict)) {
throw new FormatError(
"Invalid XRef table: could not parse trailer dictionary"
);
}
delete this.tableState;
return dict;
},
readXRefTable: function XRef_readXRefTable(parser) {
// Example of cross-reference table:
// xref
// 0 1 <-- subsection header (first obj #, obj count)
// 0000000000 65535 f <-- actual object (offset, generation #, f/n)
// 23 2 <-- subsection header ... and so on ...
// 0000025518 00002 n
// 0000025635 00000 n
// trailer
// ...
var stream = parser.lexer.stream;
var tableState = this.tableState;
stream.pos = tableState.streamPos;
parser.buf1 = tableState.parserBuf1;
parser.buf2 = tableState.parserBuf2;
// Outer loop is over subsection headers
var obj;
while (true) {
if (!("firstEntryNum" in tableState) || !("entryCount" in tableState)) {
if (isCmd((obj = parser.getObj()), "trailer")) {
break;
}
tableState.firstEntryNum = obj;
tableState.entryCount = parser.getObj();
}
var first = tableState.firstEntryNum;
var count = tableState.entryCount;
if (!Number.isInteger(first) || !Number.isInteger(count)) {
throw new FormatError(
"Invalid XRef table: wrong types in subsection header"
);
}
// Inner loop is over objects themselves
for (var i = tableState.entryNum; i < count; i++) {
tableState.streamPos = stream.pos;
tableState.entryNum = i;
tableState.parserBuf1 = parser.buf1;
tableState.parserBuf2 = parser.buf2;
var entry = {};
entry.offset = parser.getObj();
entry.gen = parser.getObj();
var type = parser.getObj();
if (type instanceof Cmd) {
switch (type.cmd) {
case "f":
entry.free = true;
break;
case "n":
entry.uncompressed = true;
break;
}
}
// Validate entry obj
if (
!Number.isInteger(entry.offset) ||
!Number.isInteger(entry.gen) ||
!(entry.free || entry.uncompressed)
) {
throw new FormatError(
`Invalid entry in XRef subsection: ${first}, ${count}`
);
}
// The first xref table entry, i.e. obj 0, should be free. Attempting
// to adjust an incorrect first obj # (fixes issue 3248 and 7229).
if (i === 0 && entry.free && first === 1) {
first = 0;
}
if (!this.entries[i + first]) {
this.entries[i + first] = entry;
}
}
tableState.entryNum = 0;
tableState.streamPos = stream.pos;
tableState.parserBuf1 = parser.buf1;
tableState.parserBuf2 = parser.buf2;
delete tableState.firstEntryNum;
delete tableState.entryCount;
}
// Sanity check: as per spec, first object must be free
if (this.entries[0] && !this.entries[0].free) {
throw new FormatError("Invalid XRef table: unexpected first object");
}
return obj;
},
processXRefStream: function XRef_processXRefStream(stream) {
if (!("streamState" in this)) {
// Stores state of the stream as we process it so we can resume
// from middle of stream in case of missing data error
var streamParameters = stream.dict;
var byteWidths = streamParameters.get("W");
var range = streamParameters.get("Index");
if (!range) {
range = [0, streamParameters.get("Size")];
}
this.streamState = {
entryRanges: range,
byteWidths,
entryNum: 0,
streamPos: stream.pos,
};
}
this.readXRefStream(stream);
delete this.streamState;
return stream.dict;
},
readXRefStream: function XRef_readXRefStream(stream) {
var i, j;
var streamState = this.streamState;
stream.pos = streamState.streamPos;
var byteWidths = streamState.byteWidths;
var typeFieldWidth = byteWidths[0];
var offsetFieldWidth = byteWidths[1];
var generationFieldWidth = byteWidths[2];
var entryRanges = streamState.entryRanges;
while (entryRanges.length > 0) {
var first = entryRanges[0];
var n = entryRanges[1];
if (!Number.isInteger(first) || !Number.isInteger(n)) {
throw new FormatError(`Invalid XRef range fields: ${first}, ${n}`);
}
if (
!Number.isInteger(typeFieldWidth) ||
!Number.isInteger(offsetFieldWidth) ||
!Number.isInteger(generationFieldWidth)
) {
throw new FormatError(
`Invalid XRef entry fields length: ${first}, ${n}`
);
}
for (i = streamState.entryNum; i < n; ++i) {
streamState.entryNum = i;
streamState.streamPos = stream.pos;
var type = 0,
offset = 0,
generation = 0;
for (j = 0; j < typeFieldWidth; ++j) {
type = (type << 8) | stream.getByte();
}
// if type field is absent, its default value is 1
if (typeFieldWidth === 0) {
type = 1;
}
for (j = 0; j < offsetFieldWidth; ++j) {
offset = (offset << 8) | stream.getByte();
}
for (j = 0; j < generationFieldWidth; ++j) {
generation = (generation << 8) | stream.getByte();
}
var entry = {};
entry.offset = offset;
entry.gen = generation;
switch (type) {
case 0:
entry.free = true;
break;
case 1:
entry.uncompressed = true;
break;
case 2:
break;
default:
throw new FormatError(`Invalid XRef entry type: ${type}`);
}
if (!this.entries[first + i]) {
this.entries[first + i] = entry;
}
}
streamState.entryNum = 0;
streamState.streamPos = stream.pos;
entryRanges.splice(0, 2);
}
},
indexObjects: function XRef_indexObjects() {
// Simple scan through the PDF content to find objects,
// trailers and XRef streams.
var TAB = 0x9,
LF = 0xa,
CR = 0xd,
SPACE = 0x20;
var PERCENT = 0x25,
LT = 0x3c;
function readToken(data, offset) {
var token = "",
ch = data[offset];
while (ch !== LF && ch !== CR && ch !== LT) {
if (++offset >= data.length) {
break;
}
token += String.fromCharCode(ch);
ch = data[offset];
}
return token;
}
function skipUntil(data, offset, what) {
var length = what.length,
dataLength = data.length;
var skipped = 0;
// finding byte sequence
while (offset < dataLength) {
var i = 0;
while (i < length && data[offset + i] === what[i]) {
++i;
}
if (i >= length) {
break; // sequence found
}
offset++;
skipped++;
}
return skipped;
}
var objRegExp = /^(\d+)\s+(\d+)\s+obj\b/;
const endobjRegExp = /\bendobj[\b\s]$/;
const nestedObjRegExp = /\s+(\d+\s+\d+\s+obj[\b\s<])$/;
const CHECK_CONTENT_LENGTH = 25;
var trailerBytes = new Uint8Array([116, 114, 97, 105, 108, 101, 114]);
// prettier-ignore
var startxrefBytes = new Uint8Array([115, 116, 97, 114, 116, 120, 114,
101, 102]);
const objBytes = new Uint8Array([111, 98, 106]);
var xrefBytes = new Uint8Array([47, 88, 82, 101, 102]);
// Clear out any existing entries, since they may be bogus.
this.entries.length = 0;
var stream = this.stream;
stream.pos = 0;
var buffer = stream.getBytes();
var position = stream.start,
length = buffer.length;
var trailers = [],
xrefStms = [];
while (position < length) {
var ch = buffer[position];
if (ch === TAB || ch === LF || ch === CR || ch === SPACE) {
++position;
continue;
}
if (ch === PERCENT) {
// %-comment
do {
++position;
if (position >= length) {
break;
}
ch = buffer[position];
} while (ch !== LF && ch !== CR);
continue;
}
var token = readToken(buffer, position);
var m;
if (
token.startsWith("xref") &&
(token.length === 4 || /\s/.test(token[4]))
) {
position += skipUntil(buffer, position, trailerBytes);
trailers.push(position);
position += skipUntil(buffer, position, startxrefBytes);
} else if ((m = objRegExp.exec(token))) {
const num = m[1] | 0,
gen = m[2] | 0;
if (!this.entries[num] || this.entries[num].gen === gen) {
this.entries[num] = {
offset: position - stream.start,
gen,
uncompressed: true,
};
}
let contentLength,
startPos = position + token.length;
// Find the next "obj" string, rather than "endobj", to ensure that
// we won't skip over a new 'obj' operator in corrupt files where
// 'endobj' operators are missing (fixes issue9105_reduced.pdf).
while (startPos < buffer.length) {
const endPos = startPos + skipUntil(buffer, startPos, objBytes) + 4;
contentLength = endPos - position;
const checkPos = Math.max(endPos - CHECK_CONTENT_LENGTH, startPos);
const tokenStr = bytesToString(buffer.subarray(checkPos, endPos));
// Check if the current object ends with an 'endobj' operator.
if (endobjRegExp.test(tokenStr)) {
break;
} else {
// Check if an "obj" occurrence is actually a new object,
// i.e. the current object is missing the 'endobj' operator.
const objToken = nestedObjRegExp.exec(tokenStr);
if (objToken && objToken[1]) {
warn(
'indexObjects: Found new "obj" inside of another "obj", ' +
'caused by missing "endobj" -- trying to recover.'
);
contentLength -= objToken[1].length;
break;
}
}
startPos = endPos;
}
const content = buffer.subarray(position, position + contentLength);
// checking XRef stream suspect
// (it shall have '/XRef' and next char is not a letter)
var xrefTagOffset = skipUntil(content, 0, xrefBytes);
if (
xrefTagOffset < contentLength &&
content[xrefTagOffset + 5] < 64
) {
xrefStms.push(position - stream.start);
this.xrefstms[position - stream.start] = 1; // Avoid recursion
}
position += contentLength;
} else if (
token.startsWith("trailer") &&
(token.length === 7 || /\s/.test(token[7]))
) {
trailers.push(position);
position += skipUntil(buffer, position, startxrefBytes);
} else {
position += token.length + 1;
}
}
// reading XRef streams
for (let i = 0, ii = xrefStms.length; i < ii; ++i) {
this.startXRefQueue.push(xrefStms[i]);
this.readXRef(/* recoveryMode */ true);
}
// finding main trailer
let trailerDict;
for (let i = 0, ii = trailers.length; i < ii; ++i) {
stream.pos = trailers[i];
const parser = new Parser({
lexer: new Lexer(stream),
xref: this,
allowStreams: true,
recoveryMode: true,
});
var obj = parser.getObj();
if (!isCmd(obj, "trailer")) {
continue;
}
// read the trailer dictionary
const dict = parser.getObj();
if (!isDict(dict)) {
continue;
}
// Do some basic validation of the trailer/root dictionary candidate.
try {
const rootDict = dict.get("Root");
if (!(rootDict instanceof Dict)) {
continue;
}
const pagesDict = rootDict.get("Pages");
if (!(pagesDict instanceof Dict)) {
continue;
}
const pagesCount = pagesDict.get("Count");
if (!Number.isInteger(pagesCount)) {
continue;
}
// The top-level /Pages dictionary isn't obviously corrupt.
} catch (ex) {
continue;
}
// taking the first one with 'ID'
if (dict.has("ID")) {
return dict;
}
// The current dictionary is a candidate, but continue searching.
trailerDict = dict;
}
// No trailer with 'ID', taking last one (if exists).
if (trailerDict) {
return trailerDict;
}
// nothing helps
throw new InvalidPDFException("Invalid PDF structure.");
},
readXRef: function XRef_readXRef(recoveryMode) {
var stream = this.stream;
// Keep track of already parsed XRef tables, to prevent an infinite loop
// when parsing corrupt PDF files where e.g. the /Prev entries create a
// circular dependency between tables (fixes bug1393476.pdf).
const startXRefParsedCache = new Set();
try {
while (this.startXRefQueue.length) {
var startXRef = this.startXRefQueue[0];
if (startXRefParsedCache.has(startXRef)) {
warn("readXRef - skipping XRef table since it was already parsed.");