pdf.js/src/core/parser.js
Ophir LOJKINE 4a66eccedc Rewrite Lexer_getNumber.
Now, it computes the numbers with only basic arithmetic operations, without first creating a string and then calling parseFloat.
The new function doesn't behave exactly the same as the old one.
In particular, the old behaviour was that when there was a number immediatly followed by an 'E', the 'E' was consumed. Now it's not. It allows for "glued" numbers and operators.
Also, the new function is faster and consumes less memory.
2014-02-01 21:46:09 +01:00

816 lines
25 KiB
JavaScript

/* -*- Mode: Java; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim: set shiftwidth=2 tabstop=2 autoindent cindent expandtab: */
/* Copyright 2012 Mozilla Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* globals Ascii85Stream, AsciiHexStream, CCITTFaxStream, Cmd, Dict, error,
FlateStream, isArray, isCmd, isDict, isInt, isName, isNum, isRef,
isString, Jbig2Stream, JpegStream, JpxStream, LZWStream, Name,
NullStream, PredictorStream, Ref, RunLengthStream, warn, info */
'use strict';
var EOF = {};
function isEOF(v) {
return v == EOF;
}
var Parser = (function ParserClosure() {
function Parser(lexer, allowStreams, xref) {
this.lexer = lexer;
this.allowStreams = allowStreams;
this.xref = xref;
this.refill();
}
Parser.prototype = {
refill: function Parser_refill() {
this.buf1 = this.lexer.getObj();
this.buf2 = this.lexer.getObj();
},
shift: function Parser_shift() {
if (isCmd(this.buf2, 'ID')) {
this.buf1 = this.buf2;
this.buf2 = null;
} else {
this.buf1 = this.buf2;
this.buf2 = this.lexer.getObj();
}
},
getObj: function Parser_getObj(cipherTransform) {
if (isCmd(this.buf1, 'BI')) { // inline image
this.shift();
return this.makeInlineImage(cipherTransform);
}
if (isCmd(this.buf1, '[')) { // array
this.shift();
var array = [];
while (!isCmd(this.buf1, ']') && !isEOF(this.buf1))
array.push(this.getObj(cipherTransform));
if (isEOF(this.buf1))
error('End of file inside array');
this.shift();
return array;
}
if (isCmd(this.buf1, '<<')) { // dictionary or stream
this.shift();
var dict = new Dict(this.xref);
while (!isCmd(this.buf1, '>>') && !isEOF(this.buf1)) {
if (!isName(this.buf1)) {
info('Malformed dictionary, key must be a name object');
this.shift();
continue;
}
var key = this.buf1.name;
this.shift();
if (isEOF(this.buf1))
break;
dict.set(key, this.getObj(cipherTransform));
}
if (isEOF(this.buf1))
error('End of file inside dictionary');
// stream objects are not allowed inside content streams or
// object streams
if (isCmd(this.buf2, 'stream')) {
return this.allowStreams ?
this.makeStream(dict, cipherTransform) : dict;
}
this.shift();
return dict;
}
if (isInt(this.buf1)) { // indirect reference or integer
var num = this.buf1;
this.shift();
if (isInt(this.buf1) && isCmd(this.buf2, 'R')) {
var ref = new Ref(num, this.buf1);
this.shift();
this.shift();
return ref;
}
return num;
}
if (isString(this.buf1)) { // string
var str = this.buf1;
this.shift();
if (cipherTransform)
str = cipherTransform.decryptString(str);
return str;
}
// simple object
var obj = this.buf1;
this.shift();
return obj;
},
makeInlineImage: function Parser_makeInlineImage(cipherTransform) {
var lexer = this.lexer;
var stream = lexer.stream;
// parse dictionary
var dict = new Dict();
while (!isCmd(this.buf1, 'ID') && !isEOF(this.buf1)) {
if (!isName(this.buf1))
error('Dictionary key must be a name object');
var key = this.buf1.name;
this.shift();
if (isEOF(this.buf1))
break;
dict.set(key, this.getObj(cipherTransform));
}
// parse image stream
var startPos = stream.pos;
// searching for the /EI\s/
var state = 0, ch, i, ii;
while (state != 4 && (ch = stream.getByte()) !== -1) {
switch (ch | 0) {
case 0x20:
case 0x0D:
case 0x0A:
// let's check next five bytes to be ASCII... just be sure
var followingBytes = stream.peekBytes(5);
for (i = 0, ii = followingBytes.length; i < ii; i++) {
ch = followingBytes[i];
if (ch !== 0x0A && ch !== 0x0D && (ch < 0x20 || ch > 0x7F)) {
// not a LF, CR, SPACE or any visible ASCII character
state = 0;
break; // some binary stuff found, resetting the state
}
}
state = state === 3 ? 4 : 0;
break;
case 0x45:
state = 2;
break;
case 0x49:
state = state === 2 ? 3 : 0;
break;
default:
state = 0;
break;
}
}
var length = (stream.pos - 4) - startPos;
var imageStream = stream.makeSubStream(startPos, length, dict);
if (cipherTransform)
imageStream = cipherTransform.createStream(imageStream);
imageStream = this.filter(imageStream, dict, length);
imageStream.dict = dict;
this.buf2 = Cmd.get('EI');
this.shift();
return imageStream;
},
fetchIfRef: function Parser_fetchIfRef(obj) {
// not relying on the xref.fetchIfRef -- xref might not be set
return isRef(obj) ? this.xref.fetch(obj) : obj;
},
makeStream: function Parser_makeStream(dict, cipherTransform) {
var lexer = this.lexer;
var stream = lexer.stream;
// get stream start position
lexer.skipToNextLine();
var pos = stream.pos - 1;
// get length
var length = this.fetchIfRef(dict.get('Length'));
if (!isInt(length)) {
info('Bad ' + length + ' attribute in stream');
length = 0;
}
// skip over the stream data
stream.pos = pos + length;
lexer.nextChar();
this.shift(); // '>>'
this.shift(); // 'stream'
if (!isCmd(this.buf1, 'endstream')) {
// bad stream length, scanning for endstream
stream.pos = pos;
var SCAN_BLOCK_SIZE = 2048;
var ENDSTREAM_SIGNATURE_LENGTH = 9;
var ENDSTREAM_SIGNATURE = [0x65, 0x6E, 0x64, 0x73, 0x74, 0x72, 0x65,
0x61, 0x6D];
var skipped = 0, found = false;
while (stream.pos < stream.end) {
var scanBytes = stream.peekBytes(SCAN_BLOCK_SIZE);
var scanLength = scanBytes.length - ENDSTREAM_SIGNATURE_LENGTH;
var found = false, i, ii, j;
for (i = 0, j = 0; i < scanLength; i++) {
var b = scanBytes[i];
if (b !== ENDSTREAM_SIGNATURE[j]) {
i -= j;
j = 0;
} else {
j++;
if (j >= ENDSTREAM_SIGNATURE_LENGTH) {
found = true;
break;
}
}
}
if (found) {
skipped += i - ENDSTREAM_SIGNATURE_LENGTH;
stream.pos += i - ENDSTREAM_SIGNATURE_LENGTH;
break;
}
skipped += scanLength;
stream.pos += scanLength;
}
if (!found) {
error('Missing endstream');
}
length = skipped;
lexer.nextChar();
this.shift();
this.shift();
}
this.shift(); // 'endstream'
stream = stream.makeSubStream(pos, length, dict);
if (cipherTransform)
stream = cipherTransform.createStream(stream);
stream = this.filter(stream, dict, length);
stream.dict = dict;
return stream;
},
filter: function Parser_filter(stream, dict, length) {
var filter = this.fetchIfRef(dict.get('Filter', 'F'));
var params = this.fetchIfRef(dict.get('DecodeParms', 'DP'));
if (isName(filter))
return this.makeFilter(stream, filter.name, length, params);
if (isArray(filter)) {
var filterArray = filter;
var paramsArray = params;
for (var i = 0, ii = filterArray.length; i < ii; ++i) {
filter = filterArray[i];
if (!isName(filter))
error('Bad filter name: ' + filter);
params = null;
if (isArray(paramsArray) && (i in paramsArray))
params = paramsArray[i];
stream = this.makeFilter(stream, filter.name, length, params);
// after the first stream the length variable is invalid
length = null;
}
}
return stream;
},
makeFilter: function Parser_makeFilter(stream, name, length, params) {
if (stream.dict.get('Length') === 0) {
return new NullStream(stream);
}
if (name == 'FlateDecode' || name == 'Fl') {
if (params) {
return new PredictorStream(new FlateStream(stream), params);
}
return new FlateStream(stream);
}
if (name == 'LZWDecode' || name == 'LZW') {
var earlyChange = 1;
if (params) {
if (params.has('EarlyChange'))
earlyChange = params.get('EarlyChange');
return new PredictorStream(
new LZWStream(stream, earlyChange), params);
}
return new LZWStream(stream, earlyChange);
}
if (name == 'DCTDecode' || name == 'DCT') {
var bytes = stream.getBytes(length);
return new JpegStream(bytes, stream.dict, this.xref);
}
if (name == 'JPXDecode' || name == 'JPX') {
var bytes = stream.getBytes(length);
return new JpxStream(bytes, stream.dict);
}
if (name == 'ASCII85Decode' || name == 'A85') {
return new Ascii85Stream(stream);
}
if (name == 'ASCIIHexDecode' || name == 'AHx') {
return new AsciiHexStream(stream);
}
if (name == 'CCITTFaxDecode' || name == 'CCF') {
return new CCITTFaxStream(stream, params);
}
if (name == 'RunLengthDecode' || name == 'RL') {
return new RunLengthStream(stream);
}
if (name == 'JBIG2Decode') {
var bytes = stream.getBytes(length);
return new Jbig2Stream(bytes, stream.dict);
}
warn('filter "' + name + '" not supported yet');
return stream;
}
};
return Parser;
})();
var Lexer = (function LexerClosure() {
function Lexer(stream, knownCommands) {
this.stream = stream;
this.nextChar();
// While lexing, we build up many strings one char at a time. Using += for
// this can result in lots of garbage strings. It's better to build an
// array of single-char strings and then join() them together at the end.
// And reusing a single array (i.e. |this.strBuf|) over and over for this
// purpose uses less memory than using a new array for each string.
this.strBuf = [];
// The PDFs might have "glued" commands with other commands, operands or
// literals, e.g. "q1". The knownCommands is a dictionary of the valid
// commands and their prefixes. The prefixes are built the following way:
// if there a command that is a prefix of the other valid command or
// literal (e.g. 'f' and 'false') the following prefixes must be included,
// 'fa', 'fal', 'fals'. The prefixes are not needed, if the command has no
// other commands or literals as a prefix. The knowCommands is optional.
this.knownCommands = knownCommands;
}
Lexer.isSpace = function Lexer_isSpace(ch) {
// space is one of the following characters: SPACE, TAB, CR, or LF
return ch === 0x20 || ch === 0x09 || ch === 0x0D || ch === 0x0A;
};
// A '1' in this array means the character is white space. A '1' or
// '2' means the character ends a name or command.
var specialChars = [
1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, // 0x
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x
1, 0, 0, 0, 0, 2, 0, 0, 2, 2, 0, 0, 0, 0, 0, 2, // 2x
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, // 3x
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 4x
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0, // 5x
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 6x
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0, // 7x
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8x
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9x
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // ax
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // bx
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // cx
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // dx
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // ex
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // fx
];
function toHexDigit(ch) {
if (ch >= 0x30 && ch <= 0x39) { // '0'-'9'
return ch & 0x0F;
}
if ((ch >= 0x41 && ch <= 0x46) || (ch >= 0x61 && ch <= 0x66)) {
// 'A'-'F', 'a'-'f'
return (ch & 0x0F) + 9;
}
return -1;
}
Lexer.prototype = {
nextChar: function Lexer_nextChar() {
return (this.currentChar = this.stream.getByte());
},
peekChar: function Lexer_peekChar() {
return this.stream.peekBytes(1)[0];
},
getNumber: function Lexer_getNumber() {
var ch = this.currentChar;
var eNotation = false;
var divideBy = 0; // different from 0 if it's a floating point value
var sign = 1;
if (ch === 0x2D) { // '-'
sign = -1;
ch = this.nextChar();
} else if (ch === 0x2B) { // '+'
ch = this.nextChar();
}
if (ch === 0x2E) { // '.'
divideBy = 10;
ch = this.nextChar();
}
if (ch < 0x30 || ch > 0x39) { // '0' - '9'
error('Invalid number: ' + String.fromCharCode(ch));
return 0;
}
var baseValue = ch - 0x30; // '0'
var powerValue = 0;
var powerValueSign = 1;
while ((ch = this.nextChar()) >= 0) {
if (0x30 <= ch && ch <= 0x39) { // '0' - '9'
var currentDigit = ch - 0x30; // '0'
if (eNotation) { // We are after an 'e' or 'E'
powerValue = powerValue * 10 + currentDigit;
} else {
if (divideBy !== 0) { // We are after a point
divideBy *= 10;
}
baseValue = baseValue * 10 + currentDigit;
}
} else if (ch === 0x2E) { // '.'
if (divideBy === 0) {
divideBy = 1;
} else {
// A number can have only one '.'
break;
}
} else if (ch === 0x2D) { // '-'
// ignore minus signs in the middle of numbers to match
// Adobe's behavior
warn('Badly formated number');
} else if (ch === 0x45 || ch === 0x65) { // 'E', 'e'
// 'E' can be either a scientific notation or the beginning of a new
// operator
var hasE = true;
ch = this.peekChar();
if (ch === 0x2B || ch === 0x2D) { // '+', '-'
powerValueSign = (ch === 0x2D) ? -1 : 1;
this.nextChar(); // Consume the sign character
} else if (ch < 0x30 || ch > 0x39) { // '0' - '9'
// The 'E' must be the beginning of a new operator
break;
}
eNotation = true;
} else {
// the last character doesn't belong to us
break;
}
}
if (divideBy !== 0) {
baseValue /= divideBy;
}
if (eNotation) {
baseValue *= Math.pow(10, powerValueSign * powerValue);
}
return sign * baseValue;
},
getString: function Lexer_getString() {
var numParen = 1;
var done = false;
var strBuf = this.strBuf;
strBuf.length = 0;
var ch = this.nextChar();
while (true) {
var charBuffered = false;
switch (ch | 0) {
case -1:
warn('Unterminated string');
done = true;
break;
case 0x28: // '('
++numParen;
strBuf.push('(');
break;
case 0x29: // ')'
if (--numParen === 0) {
this.nextChar(); // consume strings ')'
done = true;
} else {
strBuf.push(')');
}
break;
case 0x5C: // '\\'
ch = this.nextChar();
switch (ch) {
case -1:
warn('Unterminated string');
done = true;
break;
case 0x6E: // 'n'
strBuf.push('\n');
break;
case 0x72: // 'r'
strBuf.push('\r');
break;
case 0x74: // 't'
strBuf.push('\t');
break;
case 0x62: // 'b'
strBuf.push('\b');
break;
case 0x66: // 'f'
strBuf.push('\f');
break;
case 0x5C: // '\'
case 0x28: // '('
case 0x29: // ')'
strBuf.push(String.fromCharCode(ch));
break;
case 0x30: case 0x31: case 0x32: case 0x33: // '0'-'3'
case 0x34: case 0x35: case 0x36: case 0x37: // '4'-'7'
var x = ch & 0x0F;
ch = this.nextChar();
charBuffered = true;
if (ch >= 0x30 && ch <= 0x37) { // '0'-'7'
x = (x << 3) + (ch & 0x0F);
ch = this.nextChar();
if (ch >= 0x30 && ch <= 0x37) { // '0'-'7'
charBuffered = false;
x = (x << 3) + (ch & 0x0F);
}
}
strBuf.push(String.fromCharCode(x));
break;
case 0x0A: case 0x0D: // LF, CR
break;
default:
strBuf.push(String.fromCharCode(ch));
break;
}
break;
default:
strBuf.push(String.fromCharCode(ch));
break;
}
if (done) {
break;
}
if (!charBuffered) {
ch = this.nextChar();
}
}
return strBuf.join('');
},
getName: function Lexer_getName() {
var ch;
var strBuf = this.strBuf;
strBuf.length = 0;
while ((ch = this.nextChar()) >= 0 && !specialChars[ch]) {
if (ch === 0x23) { // '#'
ch = this.nextChar();
var x = toHexDigit(ch);
if (x != -1) {
var x2 = toHexDigit(this.nextChar());
if (x2 == -1)
error('Illegal digit in hex char in name: ' + x2);
strBuf.push(String.fromCharCode((x << 4) | x2));
} else {
strBuf.push('#', String.fromCharCode(ch));
}
} else {
strBuf.push(String.fromCharCode(ch));
}
}
if (strBuf.length > 128) {
error('Warning: name token is longer than allowed by the spec: ' +
strBuf.length);
}
return new Name(strBuf.join(''));
},
getHexString: function Lexer_getHexString() {
var strBuf = this.strBuf;
strBuf.length = 0;
var ch = this.currentChar;
var isFirstHex = true;
var firstDigit;
var secondDigit;
while (true) {
if (ch < 0) {
warn('Unterminated hex string');
break;
} else if (ch === 0x3E) { // '>'
this.nextChar();
break;
} else if (specialChars[ch] === 1) {
ch = this.nextChar();
continue;
} else {
if (isFirstHex) {
firstDigit = toHexDigit(ch);
if (firstDigit === -1) {
warn('Ignoring invalid character "' + ch + '" in hex string');
ch = this.nextChar();
continue;
}
} else {
secondDigit = toHexDigit(ch);
if (secondDigit === -1) {
warn('Ignoring invalid character "' + ch + '" in hex string');
ch = this.nextChar();
continue;
}
strBuf.push(String.fromCharCode((firstDigit << 4) | secondDigit));
}
isFirstHex = !isFirstHex;
ch = this.nextChar();
}
}
return strBuf.join('');
},
getObj: function Lexer_getObj() {
// skip whitespace and comments
var comment = false;
var ch = this.currentChar;
while (true) {
if (ch < 0) {
return EOF;
}
if (comment) {
if (ch === 0x0A || ch == 0x0D) // LF, CR
comment = false;
} else if (ch === 0x25) { // '%'
comment = true;
} else if (specialChars[ch] !== 1) {
break;
}
ch = this.nextChar();
}
// start reading token
switch (ch | 0) {
case 0x30: case 0x31: case 0x32: case 0x33: case 0x34: // '0'-'4'
case 0x35: case 0x36: case 0x37: case 0x38: case 0x39: // '5'-'9'
case 0x2B: case 0x2D: case 0x2E: // '+', '-', '.'
return this.getNumber();
case 0x28: // '('
return this.getString();
case 0x2F: // '/'
return this.getName();
// array punctuation
case 0x5B: // '['
this.nextChar();
return Cmd.get('[');
case 0x5D: // ']'
this.nextChar();
return Cmd.get(']');
// hex string or dict punctuation
case 0x3C: // '<'
ch = this.nextChar();
if (ch === 0x3C) {
// dict punctuation
this.nextChar();
return Cmd.get('<<');
}
return this.getHexString();
// dict punctuation
case 0x3E: // '>'
ch = this.nextChar();
if (ch === 0x3E) {
this.nextChar();
return Cmd.get('>>');
}
return Cmd.get('>');
case 0x7B: // '{'
this.nextChar();
return Cmd.get('{');
case 0x7D: // '}'
this.nextChar();
return Cmd.get('}');
case 0x29: // ')'
error('Illegal character: ' + ch);
break;
}
// command
var str = String.fromCharCode(ch);
var knownCommands = this.knownCommands;
var knownCommandFound = knownCommands && (str in knownCommands);
while ((ch = this.nextChar()) >= 0 && !specialChars[ch]) {
// stop if known command is found and next character does not make
// the str a command
var possibleCommand = str + String.fromCharCode(ch);
if (knownCommandFound && !(possibleCommand in knownCommands)) {
break;
}
if (str.length == 128)
error('Command token too long: ' + str.length);
str = possibleCommand;
knownCommandFound = knownCommands && (str in knownCommands);
}
if (str == 'true')
return true;
if (str == 'false')
return false;
if (str == 'null')
return null;
return Cmd.get(str);
},
skipToNextLine: function Lexer_skipToNextLine() {
var stream = this.stream;
var ch = this.currentChar;
while (ch >= 0) {
if (ch === 0x0D) { // CR
ch = this.nextChar();
if (ch === 0x0A) { // LF
this.nextChar();
}
break;
} else if (ch === 0x0A) { // LF
this.nextChar();
break;
}
ch = this.nextChar();
}
}
};
return Lexer;
})();
var Linearization = (function LinearizationClosure() {
function Linearization(stream) {
this.parser = new Parser(new Lexer(stream), false, null);
var obj1 = this.parser.getObj();
var obj2 = this.parser.getObj();
var obj3 = this.parser.getObj();
this.linDict = this.parser.getObj();
if (isInt(obj1) && isInt(obj2) && isCmd(obj3, 'obj') &&
isDict(this.linDict)) {
var obj = this.linDict.get('Linearized');
if (!(isNum(obj) && obj > 0))
this.linDict = null;
}
}
Linearization.prototype = {
getInt: function Linearization_getInt(name) {
var linDict = this.linDict;
var obj;
if (isDict(linDict) &&
isInt(obj = linDict.get(name)) &&
obj > 0) {
return obj;
}
error('"' + name + '" field in linearization table is invalid');
},
getHint: function Linearization_getHint(index) {
var linDict = this.linDict;
var obj1, obj2;
if (isDict(linDict) &&
isArray(obj1 = linDict.get('H')) &&
obj1.length >= 2 &&
isInt(obj2 = obj1[index]) &&
obj2 > 0) {
return obj2;
}
error('Hints table in linearization table is invalid: ' + index);
},
get length() {
if (!isDict(this.linDict))
return 0;
return this.getInt('L');
},
get hintsOffset() {
return this.getHint(0);
},
get hintsLength() {
return this.getHint(1);
},
get hintsOffset2() {
return this.getHint(2);
},
get hintsLenth2() {
return this.getHint(3);
},
get objectNumberFirst() {
return this.getInt('O');
},
get endFirst() {
return this.getInt('E');
},
get numPages() {
return this.getInt('N');
},
get mainXRefEntriesOffset() {
return this.getInt('T');
},
get pageFirst() {
return this.getInt('P');
}
};
return Linearization;
})();