/* -*- Mode: Java; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim: set shiftwidth=2 tabstop=2 autoindent cindent expandtab: */
/* Copyright 2012 Mozilla Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/* globals Ascii85Stream, AsciiHexStream, CCITTFaxStream, Cmd, Dict, error,
           FlateStream, isArray, isCmd, isDict, isInt, isName, isNum, isRef,
           isString, Jbig2Stream, JpegStream, JpxStream, LZWStream, Name,
           NullStream, PredictorStream, Ref, RunLengthStream, warn, info,
           StreamType */

'use strict';

// Unique sentinel returned by the lexer/parser when the input is exhausted.
var EOF = {};

/**
 * Tells whether a parsed object is the end-of-file sentinel.
 * @param {*} v - Any object returned by the lexer or parser.
 * @returns {boolean} True only when `v` is the EOF sentinel itself.
 */
function isEOF(v) {
  return (v === EOF);
}

/**
 * Parser: builds objects (arrays, dictionaries, references, streams and
 * inline images) out of the tokens produced by a Lexer, using a two-token
 * look-ahead (`buf1`, `buf2`).
 */
var Parser = (function ParserClosure() {
  // Byte values of the ASCII text "endstream".
  var ENDSTREAM_SIGNATURE = [0x65, 0x6E, 0x64, 0x73, 0x74,
                             0x72, 0x65, 0x61, 0x6D];
  var ENDSTREAM_SIGNATURE_LENGTH = ENDSTREAM_SIGNATURE.length;

  /**
   * Finds the first occurrence of the "endstream" signature in `bytes`.
   * Every start offset in [0, lastStart] is compared in full, so a match is
   * never lost because it begins near the end of the buffer.
   * @param {Array|Uint8Array} bytes - Bytes to search.
   * @param {number} lastStart - Last start offset that still leaves room
   *   for the complete signature.
   * @returns {number} Offset of the signature, or -1 when absent.
   */
  function findEndstreamSignature(bytes, lastStart) {
    for (var start = 0; start <= lastStart; start++) {
      var j = 0;
      while (j < ENDSTREAM_SIGNATURE_LENGTH &&
             bytes[start + j] === ENDSTREAM_SIGNATURE[j]) {
        j++;
      }
      if (j === ENDSTREAM_SIGNATURE_LENGTH) {
        return start;
      }
    }
    return -1;
  }

  /**
   * @param {Lexer} lexer - Token source.
   * @param {boolean} allowStreams - When false, a dictionary followed by
   *   the `stream` command is returned as a plain dictionary.
   * @param {Object} xref - Cross-reference table; passed to created Dict
   *   objects and used to resolve references. May be null.
   */
  function Parser(lexer, allowStreams, xref) {
    this.lexer = lexer;
    this.allowStreams = allowStreams;
    this.xref = xref;
    this.imageCache = {
      length: 0,
      adler32: 0,
      stream: null
    };
    this.refill();
  }

  Parser.prototype = {
    /** Loads both look-ahead slots from the lexer. */
    refill: function Parser_refill() {
      this.buf1 = this.lexer.getObj();
      this.buf2 = this.lexer.getObj();
    },

    /**
     * Advances the look-ahead by one token. After the `ID` command the
     * lexer is not asked for another token (inline image data follows), so
     * `buf2` is left empty.
     */
    shift: function Parser_shift() {
      if (isCmd(this.buf2, 'ID')) {
        this.buf1 = this.buf2;
        this.buf2 = null;
      } else {
        this.buf1 = this.buf2;
        this.buf2 = this.lexer.getObj();
      }
    },

    /**
     * Parses and returns the next complete object.
     * @param {Object} [cipherTransform] - When given, strings are passed
     *   through `decryptString` and streams through `createStream`.
     * @returns {*} The parsed object (array, Dict, stream, Ref, number,
     *   string, or the raw token for anything else).
     */
    getObj: function Parser_getObj(cipherTransform) {
      var buf1 = this.buf1;
      this.shift();

      if (buf1 instanceof Cmd) {
        switch (buf1.cmd) {
          case 'BI': // inline image
            return this.makeInlineImage(cipherTransform);
          case '[': // array
            var array = [];
            while (!isCmd(this.buf1, ']') && !isEOF(this.buf1)) {
              array.push(this.getObj(cipherTransform));
            }
            if (isEOF(this.buf1)) {
              error('End of file inside array');
            }
            this.shift();
            return array;
          case '<<': // dictionary or stream
            var dict = new Dict(this.xref);
            while (!isCmd(this.buf1, '>>') && !isEOF(this.buf1)) {
              if (!isName(this.buf1)) {
                info('Malformed dictionary: key must be a name object');
                this.shift();
                continue;
              }

              var key = this.buf1.name;
              this.shift();
              if (isEOF(this.buf1)) {
                break;
              }
              dict.set(key, this.getObj(cipherTransform));
            }
            if (isEOF(this.buf1)) {
              error('End of file inside dictionary');
            }

            // Stream objects are not allowed inside content streams or
            // object streams.
            if (isCmd(this.buf2, 'stream')) {
              return (this.allowStreams ?
                      this.makeStream(dict, cipherTransform) : dict);
            }
            this.shift();
            return dict;
          default: // simple object
            return buf1;
        }
      }

      if (isInt(buf1)) { // indirect reference or integer
        var num = buf1;
        if (isInt(this.buf1) && isCmd(this.buf2, 'R')) {
          var ref = new Ref(num, this.buf1);
          this.shift();
          this.shift();
          return ref;
        }
        return num;
      }

      if (isString(buf1)) { // string
        var str = buf1;
        if (cipherTransform) {
          str = cipherTransform.decryptString(str);
        }
        return str;
      }

      // simple object
      return buf1;
    },

    /**
     * Parses an inline image: the dictionary up to `ID`, then the raw image
     * bytes up to the `EI` marker. Small images of identical length and
     * checksum are served from a one-entry cache.
     * @param {Object} [cipherTransform] - Optional decryption transform.
     * @returns {Object} The (possibly filtered) image stream.
     */
    makeInlineImage: function Parser_makeInlineImage(cipherTransform) {
      var lexer = this.lexer;
      var stream = lexer.stream;

      // parse dictionary
      var dict = new Dict(null);
      while (!isCmd(this.buf1, 'ID') && !isEOF(this.buf1)) {
        if (!isName(this.buf1)) {
          error('Dictionary key must be a name object');
        }

        var key = this.buf1.name;
        this.shift();
        if (isEOF(this.buf1)) {
          break;
        }
        dict.set(key, this.getObj(cipherTransform));
      }

      // parse image stream
      var startPos = stream.pos;

      // searching for the /EI\s/
      // state: 0 = nothing, 2 = saw 'E', 3 = saw 'EI', 4 = saw 'EI' + space
      var state = 0, ch, i, ii;
      while (state !== 4 && (ch = stream.getByte()) !== -1) {
        switch (ch | 0) {
          case 0x20:
          case 0x0D:
          case 0x0A:
            // let's check next five bytes to be ASCII... just be sure
            var followingBytes = stream.peekBytes(5);
            for (i = 0, ii = followingBytes.length; i < ii; i++) {
              ch = followingBytes[i];
              if (ch !== 0x0A && ch !== 0x0D && (ch < 0x20 || ch > 0x7F)) {
                // not a LF, CR, SPACE or any visible ASCII character
                state = 0;
                break; // some binary stuff found, resetting the state
              }
            }
            state = (state === 3 ? 4 : 0);
            break;
          case 0x45:
            state = 2;
            break;
          case 0x49:
            state = (state === 2 ? 3 : 0);
            break;
          default:
            state = 0;
            break;
        }
      }

      // The four consumed marker bytes do not belong to the image data.
      var length = (stream.pos - 4) - startPos;
      var imageStream = stream.makeSubStream(startPos, length, dict);

      // trying to cache repeat images, first we are trying to "warm up" caching
      // using length, then comparing adler32
      var MAX_LENGTH_TO_CACHE = 1000;
      var cacheImage = false, adler32;
      if (length < MAX_LENGTH_TO_CACHE && this.imageCache.length === length) {
        var imageBytes = imageStream.getBytes();
        imageStream.reset();

        var a = 1;
        var b = 0;
        for (i = 0, ii = imageBytes.length; i < ii; ++i) {
          a = (a + (imageBytes[i] & 0xff)) % 65521;
          b = (b + a) % 65521;
        }
        adler32 = (b << 16) | a;

        if (this.imageCache.stream && this.imageCache.adler32 === adler32) {
          this.buf2 = Cmd.get('EI');
          this.shift();

          this.imageCache.stream.reset();
          return this.imageCache.stream;
        }
        cacheImage = true;
      }
      if (!cacheImage && !this.imageCache.stream) {
        this.imageCache.length = length;
        this.imageCache.stream = null;
      }

      if (cipherTransform) {
        imageStream = cipherTransform.createStream(imageStream, length);
      }

      imageStream = this.filter(imageStream, dict, length);
      imageStream.dict = dict;
      if (cacheImage) {
        imageStream.cacheKey = 'inline_' + length + '_' + adler32;
        this.imageCache.adler32 = adler32;
        this.imageCache.stream = imageStream;
      }

      this.buf2 = Cmd.get('EI');
      this.shift();

      return imageStream;
    },

    /**
     * Resolves `obj` through the xref table when it is a reference.
     * @param {*} obj - Any parsed object.
     * @returns {*} The fetched object, or `obj` itself when not a Ref.
     */
    fetchIfRef: function Parser_fetchIfRef(obj) {
      // not relying on the xref.fetchIfRef -- xref might not be set
      return (isRef(obj) ? this.xref.fetch(obj) : obj);
    },

    /**
     * Builds a stream object for a dictionary followed by `stream`. When
     * the declared Length does not lead to `endstream`, the data is scanned
     * for the `endstream` keyword to recover the real length.
     * @param {Dict} dict - The stream dictionary.
     * @param {Object} [cipherTransform] - Optional decryption transform.
     * @returns {Object} The (possibly filtered) stream.
     */
    makeStream: function Parser_makeStream(dict, cipherTransform) {
      var lexer = this.lexer;
      var stream = lexer.stream;

      // get stream start position
      lexer.skipToNextLine();
      var pos = stream.pos - 1;

      // get length
      var length = this.fetchIfRef(dict.get('Length'));
      if (!isInt(length)) {
        info('Bad ' + length + ' attribute in stream');
        length = 0;
      }

      // skip over the stream data
      stream.pos = pos + length;
      lexer.nextChar();

      this.shift(); // '>>'
      this.shift(); // 'stream'
      if (!isCmd(this.buf1, 'endstream')) {
        // bad stream length, scanning for endstream
        stream.pos = pos;
        var SCAN_BLOCK_SIZE = 2048;
        var skipped = 0, found = false;
        while (stream.pos < stream.end) {
          var scanBytes = stream.peekBytes(SCAN_BLOCK_SIZE);
          var lastStart = scanBytes.length - ENDSTREAM_SIGNATURE_LENGTH;
          if (lastStart < 0) {
            break; // fewer bytes left than the signature needs
          }
          var matchStart = findEndstreamSignature(scanBytes, lastStart);
          if (matchStart >= 0) {
            found = true;
            skipped += matchStart;
            stream.pos += matchStart;
            break;
          }
          // Every start offset up to lastStart has been ruled out; the next
          // window begins right after it, overlapping the unchecked tail.
          skipped += lastStart + 1;
          stream.pos += lastStart + 1;
        }
        if (!found) {
          error('Missing endstream');
        }
        length = skipped;

        lexer.nextChar();
        this.shift();
        this.shift();
      }
      this.shift(); // 'endstream'

      stream = stream.makeSubStream(pos, length, dict);
      if (cipherTransform) {
        stream = cipherTransform.createStream(stream, length);
      }
      stream = this.filter(stream, dict, length);
      stream.dict = dict;
      return stream;
    },

    /**
     * Wraps `stream` in the decoder(s) named by the dictionary's
     * Filter / F entry, pairing each with its DecodeParms / DP entry.
     * @param {Object} stream - Raw stream.
     * @param {Dict} dict - Stream dictionary.
     * @param {number} length - Length of the raw data.
     * @returns {Object} The decoded stream, or `stream` when no filter is set.
     */
    filter: function Parser_filter(stream, dict, length) {
      var filter = this.fetchIfRef(dict.get('Filter', 'F'));
      var params = this.fetchIfRef(dict.get('DecodeParms', 'DP'));
      if (isName(filter)) {
        return this.makeFilter(stream, filter.name, length, params);
      }

      var maybeLength = length;
      if (isArray(filter)) {
        var filterArray = filter;
        var paramsArray = params;
        for (var i = 0, ii = filterArray.length; i < ii; ++i) {
          filter = filterArray[i];
          if (!isName(filter)) {
            error('Bad filter name: ' + filter);
          }

          params = null;
          if (isArray(paramsArray) && (i in paramsArray)) {
            params = paramsArray[i];
          }
          stream = this.makeFilter(stream, filter.name, maybeLength, params);
          // after the first stream the length variable is invalid
          maybeLength = null;
        }
      }
      return stream;
    },

    /**
     * Creates the decoding stream for one filter name and records the
     * filter type in the xref statistics.
     * @param {Object} stream - Input stream (must carry a `dict`).
     * @param {string} name - Filter name or its abbreviation.
     * @param {number|null} maybeLength - Known data length, if any.
     * @param {Dict} [params] - Decode parameters for this filter.
     * @returns {Object} The decoding stream; a NullStream for zero-length
     *   data or when construction fails; `stream` for unknown filters.
     */
    makeFilter: function Parser_makeFilter(stream, name, maybeLength, params) {
      if (stream.dict.get('Length') === 0) {
        return new NullStream(stream);
      }
      try {
        var xrefStreamStats = this.xref.stats.streamTypes;
        if (name === 'FlateDecode' || name === 'Fl') {
          xrefStreamStats[StreamType.FLATE] = true;
          if (params) {
            return new PredictorStream(new FlateStream(stream, maybeLength),
                                       maybeLength, params);
          }
          return new FlateStream(stream, maybeLength);
        }
        if (name === 'LZWDecode' || name === 'LZW') {
          xrefStreamStats[StreamType.LZW] = true;
          var earlyChange = 1;
          if (params) {
            if (params.has('EarlyChange')) {
              earlyChange = params.get('EarlyChange');
            }
            return new PredictorStream(
              new LZWStream(stream, maybeLength, earlyChange),
              maybeLength, params);
          }
          return new LZWStream(stream, maybeLength, earlyChange);
        }
        if (name === 'DCTDecode' || name === 'DCT') {
          xrefStreamStats[StreamType.DCT] = true;
          return new JpegStream(stream, maybeLength, stream.dict, this.xref);
        }
        if (name === 'JPXDecode' || name === 'JPX') {
          xrefStreamStats[StreamType.JPX] = true;
          return new JpxStream(stream, maybeLength, stream.dict);
        }
        if (name === 'ASCII85Decode' || name === 'A85') {
          xrefStreamStats[StreamType.A85] = true;
          return new Ascii85Stream(stream, maybeLength);
        }
        if (name === 'ASCIIHexDecode' || name === 'AHx') {
          xrefStreamStats[StreamType.AHX] = true;
          return new AsciiHexStream(stream, maybeLength);
        }
        if (name === 'CCITTFaxDecode' || name === 'CCF') {
          xrefStreamStats[StreamType.CCF] = true;
          return new CCITTFaxStream(stream, maybeLength, params);
        }
        if (name === 'RunLengthDecode' || name === 'RL') {
          xrefStreamStats[StreamType.RL] = true;
          return new RunLengthStream(stream, maybeLength);
        }
        if (name === 'JBIG2Decode') {
          xrefStreamStats[StreamType.JBIG] = true;
          return new Jbig2Stream(stream, maybeLength, stream.dict);
        }
        warn('filter "' + name + '" not supported yet');
        return stream;
      } catch (ex) {
        warn('Invalid stream: \"' + ex + '\"');
        return new NullStream(stream);
      }
    }
  };

  return Parser;
})();

/**
 * Lexer: turns the bytes of a stream into tokens (numbers, strings, names,
 * commands, booleans, null and the EOF sentinel).
 */
var Lexer = (function LexerClosure() {
  /**
   * @param {Object} stream - Byte source offering getByte() / peekBytes().
   * @param {Object} [knownCommands] - Optional table of valid commands and
   *   their prefixes, used to split "glued" commands.
   */
  function Lexer(stream, knownCommands) {
    this.stream = stream;
    this.nextChar();

    // While lexing, we build up many strings one char at a time. Using += for
    // this can result in lots of garbage strings. It's better to build an
    // array of single-char strings and then join() them together at the end.
    // And reusing a single array (i.e. |this.strBuf|) over and over for this
    // purpose uses less memory than using a new array for each string.
    this.strBuf = [];

    // The PDFs might have "glued" commands with other commands, operands or
    // literals, e.g. "q1". The knownCommands is a dictionary of the valid
    // commands and their prefixes. The prefixes are built the following way:
    // if there a command that is a prefix of the other valid command or
    // literal (e.g. 'f' and 'false') the following prefixes must be included,
    // 'fa', 'fal', 'fals'. The prefixes are not needed, if the command has no
    // other commands or literals as a prefix. The knowCommands is optional.
    this.knownCommands = knownCommands;
  }

  Lexer.isSpace = function Lexer_isSpace(ch) {
    // Space is one of the following characters: SPACE, TAB, CR or LF.
    return (ch === 0x20 || ch === 0x09 || ch === 0x0D || ch === 0x0A);
  };

  // A '1' in this array means the character is white space. A '1' or
  // '2' means the character ends a name or command.
  var specialChars = [
    1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, // 0x
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x
    1, 0, 0, 0, 0, 2, 0, 0, 2, 2, 0, 0, 0, 0, 0, 2, // 2x
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, // 3x
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 4x
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0, // 5x
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 6x
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0, // 7x
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8x
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9x
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // ax
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // bx
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // cx
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // dx
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // ex
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  // fx
  ];

  /**
   * Converts a character code to the value of the hex digit it denotes.
   * @param {number} ch - Character code.
   * @returns {number} 0..15, or -1 when `ch` is not a hex digit.
   */
  function toHexDigit(ch) {
    if (ch >= 0x30 && ch <= 0x39) { // '0'-'9'
      return ch & 0x0F;
    }
    if ((ch >= 0x41 && ch <= 0x46) || (ch >= 0x61 && ch <= 0x66)) {
      // 'A'-'F', 'a'-'f'
      return (ch & 0x0F) + 9;
    }
    return -1;
  }

  Lexer.prototype = {
    /** Reads the next byte into `currentChar` and returns it. */
    nextChar: function Lexer_nextChar() {
      return (this.currentChar = this.stream.getByte());
    },

    /** Returns the byte after `currentChar` without consuming it. */
    peekChar: function Lexer_peekChar() {
      return this.stream.peekBytes(1)[0];
    },

    /**
     * Lexes a number starting at `currentChar` (optional sign, optional
     * fraction, optional exponent).
     * @returns {number} The parsed value; 0 after reporting an invalid
     *   number through error().
     */
    getNumber: function Lexer_getNumber() {
      var ch = this.currentChar;
      var eNotation = false;
      var divideBy = 0; // different from 0 if it's a floating point value
      var sign = 1;

      if (ch === 0x2D) { // '-'
        sign = -1;
        ch = this.nextChar();
      } else if (ch === 0x2B) { // '+'
        ch = this.nextChar();
      }
      if (ch === 0x2E) { // '.'
        divideBy = 10;
        ch = this.nextChar();
      }

      if (ch < 0x30 || ch > 0x39) { // '0' - '9'
        error('Invalid number: ' + String.fromCharCode(ch));
        return 0;
      }

      var baseValue = ch - 0x30; // '0'
      var powerValue = 0;
      var powerValueSign = 1;

      while ((ch = this.nextChar()) >= 0) {
        if (0x30 <= ch && ch <= 0x39) { // '0' - '9'
          var currentDigit = ch - 0x30; // '0'
          if (eNotation) { // We are after an 'e' or 'E'
            powerValue = powerValue * 10 + currentDigit;
          } else {
            if (divideBy !== 0) { // We are after a point
              divideBy *= 10;
            }
            baseValue = baseValue * 10 + currentDigit;
          }
        } else if (ch === 0x2E) { // '.'
          if (divideBy === 0) {
            divideBy = 1;
          } else {
            // A number can have only one '.'
            break;
          }
        } else if (ch === 0x2D) { // '-'
          // ignore minus signs in the middle of numbers to match
          // Adobe's behavior
          warn('Badly formated number');
        } else if (ch === 0x45 || ch === 0x65) { // 'E', 'e'
          // 'E' can be either a scientific notation or the beginning of a new
          // operator
          ch = this.peekChar();
          if (ch === 0x2B || ch === 0x2D) { // '+', '-'
            powerValueSign = (ch === 0x2D) ? -1 : 1;
            this.nextChar(); // Consume the sign character
          } else if (ch < 0x30 || ch > 0x39) { // '0' - '9'
            // The 'E' must be the beginning of a new operator
            break;
          }
          eNotation = true;
        } else {
          // the last character doesn't belong to us
          break;
        }
      }

      if (divideBy !== 0) {
        baseValue /= divideBy;
      }
      if (eNotation) {
        baseValue *= Math.pow(10, powerValueSign * powerValue);
      }
      return sign * baseValue;
    },

    /**
     * Lexes a literal string; `currentChar` is the opening '('. Handles
     * nested parentheses, backslash escapes, octal codes and line
     * continuations.
     * @returns {string} The decoded string contents.
     */
    getString: function Lexer_getString() {
      var numParen = 1;
      var done = false;
      var strBuf = this.strBuf;
      strBuf.length = 0;

      var ch = this.nextChar();
      while (true) {
        // Set when `ch` already holds the next unprocessed character.
        var charBuffered = false;
        switch (ch | 0) {
          case -1:
            warn('Unterminated string');
            done = true;
            break;
          case 0x28: // '('
            ++numParen;
            strBuf.push('(');
            break;
          case 0x29: // ')'
            if (--numParen === 0) {
              this.nextChar(); // consume strings ')'
              done = true;
            } else {
              strBuf.push(')');
            }
            break;
          case 0x5C: // '\\'
            ch = this.nextChar();
            switch (ch) {
              case -1:
                warn('Unterminated string');
                done = true;
                break;
              case 0x6E: // 'n'
                strBuf.push('\n');
                break;
              case 0x72: // 'r'
                strBuf.push('\r');
                break;
              case 0x74: // 't'
                strBuf.push('\t');
                break;
              case 0x62: // 'b'
                strBuf.push('\b');
                break;
              case 0x66: // 'f'
                strBuf.push('\f');
                break;
              case 0x5C: // '\'
              case 0x28: // '('
              case 0x29: // ')'
                strBuf.push(String.fromCharCode(ch));
                break;
              case 0x30: case 0x31: case 0x32: case 0x33: // '0'-'3'
              case 0x34: case 0x35: case 0x36: case 0x37: // '4'-'7'
                var x = ch & 0x0F;
                ch = this.nextChar();
                charBuffered = true;
                if (ch >= 0x30 && ch <= 0x37) { // '0'-'7'
                  x = (x << 3) + (ch & 0x0F);
                  ch = this.nextChar();
                  if (ch >= 0x30 && ch <= 0x37) { // '0'-'7'
                    charBuffered = false;
                    x = (x << 3) + (ch & 0x0F);
                  }
                }
                strBuf.push(String.fromCharCode(x));
                break;
              case 0x0D: // CR
                if (this.peekChar() === 0x0A) { // LF
                  this.nextChar();
                }
                break;
              case 0x0A: // LF
                break;
              default:
                strBuf.push(String.fromCharCode(ch));
                break;
            }
            break;
          default:
            strBuf.push(String.fromCharCode(ch));
            break;
        }
        if (done) {
          break;
        }
        if (!charBuffered) {
          ch = this.nextChar();
        }
      }
      return strBuf.join('');
    },

    /**
     * Lexes a name; `currentChar` is the leading '/'. `#xx` sequences are
     * decoded; a '#' not followed by a hex digit is kept literally.
     * An over-long name is only reported through warn().
     * @returns {Name} The name object obtained from Name.get().
     */
    getName: function Lexer_getName() {
      var ch;
      var strBuf = this.strBuf;
      strBuf.length = 0;
      while ((ch = this.nextChar()) >= 0 && !specialChars[ch]) {
        if (ch === 0x23) { // '#'
          ch = this.nextChar();
          var x = toHexDigit(ch);
          if (x !== -1) {
            var secondCh = this.nextChar();
            var x2 = toHexDigit(secondCh);
            if (x2 === -1) {
              // Report the offending character, not the -1 lookup result.
              error('Illegal digit in hex char in name: ' +
                    (secondCh < 0 ? 'EOF' : String.fromCharCode(secondCh)));
            }
            strBuf.push(String.fromCharCode((x << 4) | x2));
          } else {
            strBuf.push('#', String.fromCharCode(ch));
          }
        } else {
          strBuf.push(String.fromCharCode(ch));
        }
      }
      if (strBuf.length > 128) {
        // Exceeding the length cap is not fatal; the name is still usable.
        warn('name token is longer than allowed by the spec: ' +
             strBuf.length);
      }
      return Name.get(strBuf.join(''));
    },

    /**
     * Lexes a hexadecimal string; `currentChar` is the first character
     * after '<'. White space and invalid characters are skipped. When the
     * number of digits is odd, the missing final digit is taken as 0.
     * @returns {string} The decoded string contents.
     */
    getHexString: function Lexer_getHexString() {
      var strBuf = this.strBuf;
      strBuf.length = 0;
      var ch = this.currentChar;
      var isFirstHex = true;
      var firstDigit;
      var secondDigit;
      while (true) {
        if (ch < 0) {
          warn('Unterminated hex string');
          break;
        } else if (ch === 0x3E) { // '>'
          this.nextChar();
          break;
        } else if (specialChars[ch] === 1) {
          ch = this.nextChar();
          continue;
        } else {
          if (isFirstHex) {
            firstDigit = toHexDigit(ch);
            if (firstDigit === -1) {
              warn('Ignoring invalid character "' + ch + '" in hex string');
              ch = this.nextChar();
              continue;
            }
          } else {
            secondDigit = toHexDigit(ch);
            if (secondDigit === -1) {
              warn('Ignoring invalid character "' + ch + '" in hex string');
              ch = this.nextChar();
              continue;
            }
            strBuf.push(String.fromCharCode((firstDigit << 4) | secondDigit));
          }
          isFirstHex = !isFirstHex;
          ch = this.nextChar();
        }
      }
      if (!isFirstHex) {
        // A high digit is pending: pad the missing low digit with 0.
        strBuf.push(String.fromCharCode(firstDigit << 4));
      }
      return strBuf.join('');
    },

    /**
     * Skips white space and comments, then lexes and returns one token.
     * @returns {*} A number, string, Name, Cmd, true/false/null, or EOF.
     */
    getObj: function Lexer_getObj() {
      // skip whitespace and comments
      var comment = false;
      var ch = this.currentChar;
      while (true) {
        if (ch < 0) {
          return EOF;
        }
        if (comment) {
          if (ch === 0x0A || ch === 0x0D) { // LF, CR
            comment = false;
          }
        } else if (ch === 0x25) { // '%'
          comment = true;
        } else if (specialChars[ch] !== 1) {
          break;
        }
        ch = this.nextChar();
      }

      // start reading token
      switch (ch | 0) {
        case 0x30: case 0x31: case 0x32: case 0x33: case 0x34: // '0'-'4'
        case 0x35: case 0x36: case 0x37: case 0x38: case 0x39: // '5'-'9'
        case 0x2B: case 0x2D: case 0x2E: // '+', '-', '.'
          return this.getNumber();
        case 0x28: // '('
          return this.getString();
        case 0x2F: // '/'
          return this.getName();
        // array punctuation
        case 0x5B: // '['
          this.nextChar();
          return Cmd.get('[');
        case 0x5D: // ']'
          this.nextChar();
          return Cmd.get(']');
        // hex string or dict punctuation
        case 0x3C: // '<'
          ch = this.nextChar();
          if (ch === 0x3C) {
            // dict punctuation
            this.nextChar();
            return Cmd.get('<<');
          }
          return this.getHexString();
        // dict punctuation
        case 0x3E: // '>'
          ch = this.nextChar();
          if (ch === 0x3E) {
            this.nextChar();
            return Cmd.get('>>');
          }
          return Cmd.get('>');
        case 0x7B: // '{'
          this.nextChar();
          return Cmd.get('{');
        case 0x7D: // '}'
          this.nextChar();
          return Cmd.get('}');
        case 0x29: // ')'
          error('Illegal character: ' + ch);
          break;
      }

      // command
      var str = String.fromCharCode(ch);
      var knownCommands = this.knownCommands;
      var knownCommandFound = knownCommands && knownCommands[str] !== undefined;
      while ((ch = this.nextChar()) >= 0 && !specialChars[ch]) {
        // stop if known command is found and next character does not make
        // the str a command
        var possibleCommand = str + String.fromCharCode(ch);
        if (knownCommandFound && knownCommands[possibleCommand] === undefined) {
          break;
        }
        if (str.length === 128) {
          error('Command token too long: ' + str.length);
        }
        str = possibleCommand;
        knownCommandFound = knownCommands && knownCommands[str] !== undefined;
      }
      if (str === 'true') {
        return true;
      }
      if (str === 'false') {
        return false;
      }
      if (str === 'null') {
        return null;
      }
      return Cmd.get(str);
    },

    /**
     * Advances past the end of the current line (CR, LF or CR LF), leaving
     * `currentChar` on the first character of the next line.
     */
    skipToNextLine: function Lexer_skipToNextLine() {
      var ch = this.currentChar;
      while (ch >= 0) {
        if (ch === 0x0D) { // CR
          ch = this.nextChar();
          if (ch === 0x0A) { // LF
            this.nextChar();
          }
          break;
        } else if (ch === 0x0A) { // LF
          this.nextChar();
          break;
        }
        ch = this.nextChar();
      }
    }
  };

  return Lexer;
})();

/**
 * Linearization: reads the first object of a stream and exposes the entries
 * of its linearization dictionary through getters.
 */
var Linearization = (function LinearizationClosure() {
  /**
   * @param {Object} stream - Stream positioned at the object to inspect.
   */
  function Linearization(stream) {
    this.parser = new Parser(new Lexer(stream), false, null);
    var obj1 = this.parser.getObj();
    var obj2 = this.parser.getObj();
    var obj3 = this.parser.getObj();
    this.linDict = this.parser.getObj();
    if (isInt(obj1) && isInt(obj2) && isCmd(obj3, 'obj') &&
        isDict(this.linDict)) {
      var obj = this.linDict.get('Linearized');
      // The 'Linearized' entry must be a positive number.
      if (!(isNum(obj) && obj > 0)) {
        this.linDict = null;
      }
    }
  }

  Linearization.prototype = {
    /**
     * Reads a positive integer entry of the linearization dictionary.
     * @param {string} name - Dictionary key.
     * @returns {number} The entry; error() is called when it is missing or
     *   not a positive integer.
     */
    getInt: function Linearization_getInt(name) {
      var linDict = this.linDict;
      var obj;
      if (isDict(linDict) && isInt(obj = linDict.get(name)) && obj > 0) {
        return obj;
      }
      error('"' + name + '" field in linearization table is invalid');
    },

    /**
     * Reads one positive integer from the 'H' (hints) array.
     * @param {number} index - Position inside the 'H' array.
     * @returns {number} The entry; error() is called when it is invalid.
     */
    getHint: function Linearization_getHint(index) {
      var linDict = this.linDict;
      var obj1, obj2;
      if (isDict(linDict) && isArray(obj1 = linDict.get('H')) &&
          obj1.length >= 2 && isInt(obj2 = obj1[index]) && obj2 > 0) {
        return obj2;
      }
      error('Hints table in linearization table is invalid: ' + index);
    },

    get length() {
      if (!isDict(this.linDict)) {
        return 0;
      }
      return this.getInt('L');
    },
    get hintsOffset() {
      return this.getHint(0);
    },
    get hintsLength() {
      return this.getHint(1);
    },
    get hintsOffset2() {
      return this.getHint(2);
    },
    // Misspelled name kept so existing callers keep working.
    get hintsLenth2() {
      return this.getHint(3);
    },
    // Correctly spelled equivalent of `hintsLenth2`.
    get hintsLength2() {
      return this.getHint(3);
    },
    get objectNumberFirst() {
      return this.getInt('O');
    },
    get endFirst() {
      return this.getInt('E');
    },
    get numPages() {
      return this.getInt('N');
    },
    get mainXRefEntriesOffset() {
      return this.getInt('T');
    },
    get pageFirst() {
      return this.getInt('P');
    }
  };

  return Linearization;
})();