Re-factor the find
helper function, in src/core/document.js
, to search through the raw bytes rather than a string
During initial parsing of every PDF document we're currently creating a few `1 kB` strings, in order to find certain commands needed for initialization. This seems inefficient, not to mention completely unnecessary, since we can just as well search through the raw bytes directly instead (similar to other parts of the code-base). One small complication here is the need to support backwards search, which does add some amount of "duplication" to this function. The main benefits here are: - No longer necessary to allocate *temporary* `1 kB` strings during initial parsing, thus saving some memory. - In practice, for well-formed PDF documents, the number of iterations required to find the commands are usually very low. (For the `tracemonkey.pdf` file, there's a *total* of only 30 loop iterations.)
This commit is contained in:
parent
8b35bba347
commit
dbb82f05fc
@ -15,8 +15,8 @@
|
|||||||
/* eslint no-var: error */
|
/* eslint no-var: error */
|
||||||
|
|
||||||
import {
|
import {
|
||||||
assert, bytesToString, FormatError, info, InvalidPDFException, isArrayBuffer,
|
assert, FormatError, info, InvalidPDFException, isArrayBuffer, isArrayEqual,
|
||||||
isArrayEqual, isBool, isNum, isSpace, isString, OPS, shadow, stringToBytes,
|
isBool, isNum, isSpace, isString, OPS, shadow, stringToBytes,
|
||||||
stringToPDFString, Util, warn
|
stringToPDFString, Util, warn
|
||||||
} from '../shared/util';
|
} from '../shared/util';
|
||||||
import { Catalog, ObjectLoader, XRef } from './obj';
|
import { Catalog, ObjectLoader, XRef } from './obj';
|
||||||
@ -344,22 +344,60 @@ class Page {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const PDF_HEADER_SIGNATURE = new Uint8Array([0x25, 0x50, 0x44, 0x46, 0x2D]);
|
||||||
|
const STARTXREF_SIGNATURE = new Uint8Array([
|
||||||
|
0x73, 0x74, 0x61, 0x72, 0x74, 0x78, 0x72, 0x65, 0x66]);
|
||||||
|
const ENDOBJ_SIGNATURE = new Uint8Array([0x65, 0x6E, 0x64, 0x6F, 0x62, 0x6A]);
|
||||||
|
|
||||||
const FINGERPRINT_FIRST_BYTES = 1024;
|
const FINGERPRINT_FIRST_BYTES = 1024;
|
||||||
const EMPTY_FINGERPRINT = '\x00\x00\x00\x00\x00\x00\x00' +
|
const EMPTY_FINGERPRINT =
|
||||||
'\x00\x00\x00\x00\x00\x00\x00\x00\x00';
|
'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00';
|
||||||
|
|
||||||
function find(stream, needle, limit, backwards = false) {
|
function find(stream, signature, limit = 1024, backwards = false) {
|
||||||
|
if (typeof PDFJSDev === 'undefined' ||
|
||||||
|
PDFJSDev.test('!PRODUCTION || TESTING')) {
|
||||||
assert(limit > 0, 'The "limit" must be a positive integer.');
|
assert(limit > 0, 'The "limit" must be a positive integer.');
|
||||||
|
}
|
||||||
|
const signatureLength = signature.length;
|
||||||
|
|
||||||
const str = bytesToString(stream.peekBytes(limit));
|
const scanBytes = stream.peekBytes(limit);
|
||||||
|
const scanLength = scanBytes.length - signatureLength;
|
||||||
|
|
||||||
const index = backwards ? str.lastIndexOf(needle) : str.indexOf(needle);
|
if (scanLength <= 0) {
|
||||||
if (index === -1) {
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
stream.pos += index;
|
if (backwards) {
|
||||||
|
const signatureEnd = signatureLength - 1;
|
||||||
|
|
||||||
|
let pos = scanBytes.length - 1;
|
||||||
|
while (pos >= signatureEnd) {
|
||||||
|
let j = 0;
|
||||||
|
while (j < signatureLength &&
|
||||||
|
scanBytes[pos - j] === signature[signatureEnd - j]) {
|
||||||
|
j++;
|
||||||
|
}
|
||||||
|
if (j >= signatureLength) { // `signature` found.
|
||||||
|
stream.pos += (pos - signatureEnd);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
pos--;
|
||||||
|
}
|
||||||
|
} else { // forwards
|
||||||
|
let pos = 0;
|
||||||
|
while (pos <= scanLength) {
|
||||||
|
let j = 0;
|
||||||
|
while (j < signatureLength && scanBytes[pos + j] === signature[j]) {
|
||||||
|
j++;
|
||||||
|
}
|
||||||
|
if (j >= signatureLength) { // `signature` found.
|
||||||
|
stream.pos += pos;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
pos++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* The `PDFDocument` class holds all the (worker-thread) data of the PDF file.
|
* The `PDFDocument` class holds all the (worker-thread) data of the PDF file.
|
||||||
@ -450,13 +488,13 @@ class PDFDocument {
|
|||||||
if (this.linearization) {
|
if (this.linearization) {
|
||||||
// Find the end of the first object.
|
// Find the end of the first object.
|
||||||
stream.reset();
|
stream.reset();
|
||||||
if (find(stream, 'endobj', 1024)) {
|
if (find(stream, ENDOBJ_SIGNATURE)) {
|
||||||
startXRef = (stream.pos + 6) - stream.start;
|
startXRef = (stream.pos + 6) - stream.start;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// Find `startxref` by checking backwards from the end of the file.
|
// Find `startxref` by checking backwards from the end of the file.
|
||||||
const step = 1024;
|
const step = 1024;
|
||||||
const startXRefLength = 'startxref'.length;
|
const startXRefLength = STARTXREF_SIGNATURE.length;
|
||||||
let found = false, pos = stream.end;
|
let found = false, pos = stream.end;
|
||||||
|
|
||||||
while (!found && pos > 0) {
|
while (!found && pos > 0) {
|
||||||
@ -465,7 +503,7 @@ class PDFDocument {
|
|||||||
pos = 0;
|
pos = 0;
|
||||||
}
|
}
|
||||||
stream.pos = pos;
|
stream.pos = pos;
|
||||||
found = find(stream, 'startxref', step, true);
|
found = find(stream, STARTXREF_SIGNATURE, step, true);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (found) {
|
if (found) {
|
||||||
@ -494,7 +532,7 @@ class PDFDocument {
|
|||||||
const stream = this.stream;
|
const stream = this.stream;
|
||||||
stream.reset();
|
stream.reset();
|
||||||
|
|
||||||
if (!find(stream, '%PDF-', 1024)) {
|
if (!find(stream, PDF_HEADER_SIGNATURE)) {
|
||||||
// May not be a PDF file, but don't throw an error and let
|
// May not be a PDF file, but don't throw an error and let
|
||||||
// parsing continue.
|
// parsing continue.
|
||||||
return;
|
return;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user