From 7d0ecee771e50a7125725a61729c63ac69f8d275 Mon Sep 17 00:00:00 2001 From: Tim van der Meij Date: Sun, 10 Mar 2019 14:09:31 +0100 Subject: [PATCH 1/5] Convert the `Parser` class in `src/core/parser.js` to ES6 syntax --- src/core/parser.js | 1229 ++++++++++++++++++++++---------------------- 1 file changed, 620 insertions(+), 609 deletions(-) diff --git a/src/core/parser.js b/src/core/parser.js index 8eb8a578c..f41951e08 100644 --- a/src/core/parser.js +++ b/src/core/parser.js @@ -34,7 +34,7 @@ const MAX_LENGTH_TO_CACHE = 1000; const MAX_ADLER32_LENGTH = 5552; function computeAdler32(bytes) { - let bytesLength = bytes.length; + const bytesLength = bytes.length; if (typeof PDFJSDev === 'undefined' || PDFJSDev.test('!PRODUCTION || TESTING')) { assert(bytesLength < MAX_ADLER32_LENGTH, @@ -49,659 +49,670 @@ function computeAdler32(bytes) { return ((b % 65521) << 16) | (a % 65521); } -var Parser = (function ParserClosure() { - function Parser(lexer, allowStreams, xref, recoveryMode) { +class Parser { + constructor(lexer, allowStreams, xref, recoveryMode = false) { this.lexer = lexer; this.allowStreams = allowStreams; this.xref = xref; - this.recoveryMode = recoveryMode || false; + this.recoveryMode = recoveryMode; + this.imageCache = Object.create(null); this.refill(); } - Parser.prototype = { - refill: function Parser_refill() { - this.buf1 = this.lexer.getObj(); + refill() { + this.buf1 = this.lexer.getObj(); + this.buf2 = this.lexer.getObj(); + } + + shift() { + if (isCmd(this.buf2, 'ID')) { + this.buf1 = this.buf2; + this.buf2 = null; + } else { + this.buf1 = this.buf2; this.buf2 = this.lexer.getObj(); - }, - shift: function Parser_shift() { - if (isCmd(this.buf2, 'ID')) { - this.buf1 = this.buf2; - this.buf2 = null; - } else { - this.buf1 = this.buf2; - this.buf2 = this.lexer.getObj(); - } - }, - tryShift: function Parser_tryShift() { - try { - this.shift(); - return true; - } catch (e) { - if (e instanceof MissingDataException) { - throw e; - } - // Upon failure, 
the caller should reset this.lexer.pos to a known good - // state and call this.shift() twice to reset the buffers. - return false; - } - }, - getObj: function Parser_getObj(cipherTransform) { - var buf1 = this.buf1; - this.shift(); + } + } - if (buf1 instanceof Cmd) { - switch (buf1.cmd) { - case 'BI': // inline image - return this.makeInlineImage(cipherTransform); - case '[': // array - var array = []; - while (!isCmd(this.buf1, ']') && !isEOF(this.buf1)) { - array.push(this.getObj(cipherTransform)); + tryShift() { + try { + this.shift(); + return true; + } catch (e) { + if (e instanceof MissingDataException) { + throw e; + } + // Upon failure, the caller should reset this.lexer.pos to a known good + // state and call this.shift() twice to reset the buffers. + return false; + } + } + + getObj(cipherTransform) { + const buf1 = this.buf1; + this.shift(); + + if (buf1 instanceof Cmd) { + switch (buf1.cmd) { + case 'BI': // inline image + return this.makeInlineImage(cipherTransform); + case '[': // array + const array = []; + while (!isCmd(this.buf1, ']') && !isEOF(this.buf1)) { + array.push(this.getObj(cipherTransform)); + } + if (isEOF(this.buf1)) { + if (!this.recoveryMode) { + throw new FormatError('End of file inside array'); } - if (isEOF(this.buf1)) { - if (!this.recoveryMode) { - throw new FormatError('End of file inside array'); - } - return array; - } - this.shift(); return array; - case '<<': // dictionary or stream - var dict = new Dict(this.xref); - while (!isCmd(this.buf1, '>>') && !isEOF(this.buf1)) { - if (!isName(this.buf1)) { - info('Malformed dictionary: key must be a name object'); - this.shift(); - continue; - } - - var key = this.buf1.name; + } + this.shift(); + return array; + case '<<': // dictionary or stream + const dict = new Dict(this.xref); + while (!isCmd(this.buf1, '>>') && !isEOF(this.buf1)) { + if (!isName(this.buf1)) { + info('Malformed dictionary: key must be a name object'); this.shift(); - if (isEOF(this.buf1)) { - break; - } - 
dict.set(key, this.getObj(cipherTransform)); - } - if (isEOF(this.buf1)) { - if (!this.recoveryMode) { - throw new FormatError('End of file inside dictionary'); - } - return dict; + continue; } - // Stream objects are not allowed inside content streams or - // object streams. - if (isCmd(this.buf2, 'stream')) { - return (this.allowStreams ? - this.makeStream(dict, cipherTransform) : dict); - } + const key = this.buf1.name; this.shift(); + if (isEOF(this.buf1)) { + break; + } + dict.set(key, this.getObj(cipherTransform)); + } + if (isEOF(this.buf1)) { + if (!this.recoveryMode) { + throw new FormatError('End of file inside dictionary'); + } return dict; - default: // simple object - return buf1; - } - } + } - if (Number.isInteger(buf1)) { // indirect reference or integer - var num = buf1; - if (Number.isInteger(this.buf1) && isCmd(this.buf2, 'R')) { - var ref = new Ref(num, this.buf1); + // Stream objects are not allowed inside content streams or + // object streams. + if (isCmd(this.buf2, 'stream')) { + return (this.allowStreams ? + this.makeStream(dict, cipherTransform) : dict); + } this.shift(); - this.shift(); - return ref; - } - return num; + return dict; + default: // simple object + return buf1; } + } - if (isString(buf1)) { // string - var str = buf1; - if (cipherTransform) { - str = cipherTransform.decryptString(str); - } - return str; - } - - // simple object - return buf1; - }, - /** - * Find the end of the stream by searching for the /EI\s/. - * @returns {number} The inline stream length. - */ - findDefaultInlineStreamEnd(stream) { - const E = 0x45, I = 0x49, SPACE = 0x20, LF = 0xA, CR = 0xD; - const n = 10, NUL = 0x0; - let startPos = stream.pos, state = 0, ch, maybeEIPos; - while ((ch = stream.getByte()) !== -1) { - if (state === 0) { - state = (ch === E) ? 1 : 0; - } else if (state === 1) { - state = (ch === I) ? 
2 : 0; - } else { - assert(state === 2); - if (ch === SPACE || ch === LF || ch === CR) { - maybeEIPos = stream.pos; - // Let's check that the next `n` bytes are ASCII... just to be sure. - let followingBytes = stream.peekBytes(n); - for (let i = 0, ii = followingBytes.length; i < ii; i++) { - ch = followingBytes[i]; - if (ch === NUL && followingBytes[i + 1] !== NUL) { - // NUL bytes are not supposed to occur *outside* of inline - // images, but some PDF generators violate that assumption, - // thus breaking the EI detection heuristics used below. - // - // However, we can't unconditionally treat NUL bytes as "ASCII", - // since that *could* result in inline images being truncated. - // - // To attempt to address this, we'll still treat any *sequence* - // of NUL bytes as non-ASCII, but for a *single* NUL byte we'll - // continue checking the `followingBytes` (fixes issue8823.pdf). - continue; - } - if (ch !== LF && ch !== CR && (ch < SPACE || ch > 0x7F)) { - // Not a LF, CR, SPACE or any visible ASCII character, i.e. - // it's binary stuff. Resetting the state. - state = 0; - break; - } - } - if (state === 2) { - break; // Finished! - } - } else { - state = 0; - } - } - } - - if (ch === -1) { - warn('findDefaultInlineStreamEnd: ' + - 'Reached the end of the stream without finding a valid EI marker'); - if (maybeEIPos) { - warn('... trying to recover by using the last "EI" occurrence.'); - stream.skip(-(stream.pos - maybeEIPos)); // Reset the stream position. - } - } - - let endOffset = 4; - stream.skip(-endOffset); // Set the stream position to just before "EI". - ch = stream.peekByte(); - stream.skip(endOffset); // ... and remember to reset the stream position. - - // Ensure that we don't accidentally truncate the inline image, when the - // data is immediately followed by the "EI" marker (fixes issue10388.pdf). 
- if (!isSpace(ch)) { - endOffset--; - } - return ((stream.pos - endOffset) - startPos); - }, - /** - * Find the EOI (end-of-image) marker 0xFFD9 of the stream. - * @returns {number} The inline stream length. - */ - findDCTDecodeInlineStreamEnd: - function Parser_findDCTDecodeInlineStreamEnd(stream) { - var startPos = stream.pos, foundEOI = false, b, markerLength, length; - while ((b = stream.getByte()) !== -1) { - if (b !== 0xFF) { // Not a valid marker. - continue; - } - switch (stream.getByte()) { - case 0x00: // Byte stuffing. - // 0xFF00 appears to be a very common byte sequence in JPEG images. - break; - - case 0xFF: // Fill byte. - // Avoid skipping a valid marker, resetting the stream position. - stream.skip(-1); - break; - - case 0xD9: // EOI - foundEOI = true; - break; - - case 0xC0: // SOF0 - case 0xC1: // SOF1 - case 0xC2: // SOF2 - case 0xC3: // SOF3 - /* falls through */ - case 0xC5: // SOF5 - case 0xC6: // SOF6 - case 0xC7: // SOF7 - /* falls through */ - case 0xC9: // SOF9 - case 0xCA: // SOF10 - case 0xCB: // SOF11 - /* falls through */ - case 0xCD: // SOF13 - case 0xCE: // SOF14 - case 0xCF: // SOF15 - /* falls through */ - case 0xC4: // DHT - case 0xCC: // DAC - /* falls through */ - case 0xDA: // SOS - case 0xDB: // DQT - case 0xDC: // DNL - case 0xDD: // DRI - case 0xDE: // DHP - case 0xDF: // EXP - /* falls through */ - case 0xE0: // APP0 - case 0xE1: // APP1 - case 0xE2: // APP2 - case 0xE3: // APP3 - case 0xE4: // APP4 - case 0xE5: // APP5 - case 0xE6: // APP6 - case 0xE7: // APP7 - case 0xE8: // APP8 - case 0xE9: // APP9 - case 0xEA: // APP10 - case 0xEB: // APP11 - case 0xEC: // APP12 - case 0xED: // APP13 - case 0xEE: // APP14 - case 0xEF: // APP15 - /* falls through */ - case 0xFE: // COM - // The marker should be followed by the length of the segment. 
- markerLength = stream.getUint16(); - if (markerLength > 2) { - // |markerLength| contains the byte length of the marker segment, - // including its own length (2 bytes) and excluding the marker. - stream.skip(markerLength - 2); // Jump to the next marker. - } else { - // The marker length is invalid, resetting the stream position. - stream.skip(-2); - } - break; - } - if (foundEOI) { - break; - } - } - length = stream.pos - startPos; - if (b === -1) { - warn('Inline DCTDecode image stream: ' + - 'EOI marker not found, searching for /EI/ instead.'); - stream.skip(-length); // Reset the stream position. - return this.findDefaultInlineStreamEnd(stream); - } - this.inlineStreamSkipEI(stream); - return length; - }, - /** - * Find the EOD (end-of-data) marker '~>' (i.e. TILDE + GT) of the stream. - * @returns {number} The inline stream length. - */ - findASCII85DecodeInlineStreamEnd(stream) { - var TILDE = 0x7E, GT = 0x3E; - var startPos = stream.pos, ch, length; - while ((ch = stream.getByte()) !== -1) { - if (ch === TILDE) { - ch = stream.peekByte(); - // Handle corrupt PDF documents which contains whitespace "inside" of - // the EOD marker (fixes issue10614.pdf). - while (isSpace(ch)) { - stream.skip(); - ch = stream.peekByte(); - } - if (ch === GT) { - stream.skip(); - break; - } - } - } - length = stream.pos - startPos; - if (ch === -1) { - warn('Inline ASCII85Decode image stream: ' + - 'EOD marker not found, searching for /EI/ instead.'); - stream.skip(-length); // Reset the stream position. - return this.findDefaultInlineStreamEnd(stream); - } - this.inlineStreamSkipEI(stream); - return length; - }, - /** - * Find the EOD (end-of-data) marker '>' (i.e. GT) of the stream. - * @returns {number} The inline stream length. 
- */ - findASCIIHexDecodeInlineStreamEnd: - function Parser_findASCIIHexDecodeInlineStreamEnd(stream) { - var GT = 0x3E; - var startPos = stream.pos, ch, length; - while ((ch = stream.getByte()) !== -1) { - if (ch === GT) { - break; - } - } - length = stream.pos - startPos; - if (ch === -1) { - warn('Inline ASCIIHexDecode image stream: ' + - 'EOD marker not found, searching for /EI/ instead.'); - stream.skip(-length); // Reset the stream position. - return this.findDefaultInlineStreamEnd(stream); - } - this.inlineStreamSkipEI(stream); - return length; - }, - /** - * Skip over the /EI/ for streams where we search for an EOD marker. - */ - inlineStreamSkipEI: function Parser_inlineStreamSkipEI(stream) { - var E = 0x45, I = 0x49; - var state = 0, ch; - while ((ch = stream.getByte()) !== -1) { - if (state === 0) { - state = (ch === E) ? 1 : 0; - } else if (state === 1) { - state = (ch === I) ? 2 : 0; - } else if (state === 2) { - break; - } - } - }, - makeInlineImage: function Parser_makeInlineImage(cipherTransform) { - var lexer = this.lexer; - var stream = lexer.stream; - - // Parse dictionary. - let dict = new Dict(this.xref), dictLength; - while (!isCmd(this.buf1, 'ID') && !isEOF(this.buf1)) { - if (!isName(this.buf1)) { - throw new FormatError('Dictionary key must be a name object'); - } - var key = this.buf1.name; + if (Number.isInteger(buf1)) { // indirect reference or integer + const num = buf1; + if (Number.isInteger(this.buf1) && isCmd(this.buf2, 'R')) { + const ref = new Ref(num, this.buf1); this.shift(); - if (isEOF(this.buf1)) { - break; - } - dict.set(key, this.getObj(cipherTransform)); - } - if (lexer.beginInlineImagePos !== -1) { - dictLength = stream.pos - lexer.beginInlineImagePos; - } - - // Extract the name of the first (i.e. the current) image filter. 
- var filter = dict.get('Filter', 'F'), filterName; - if (isName(filter)) { - filterName = filter.name; - } else if (Array.isArray(filter)) { - var filterZero = this.xref.fetchIfRef(filter[0]); - if (isName(filterZero)) { - filterName = filterZero.name; - } - } - - // Parse image stream. - let startPos = stream.pos, length; - if (filterName === 'DCTDecode' || filterName === 'DCT') { - length = this.findDCTDecodeInlineStreamEnd(stream); - } else if (filterName === 'ASCII85Decode' || filterName === 'A85') { - length = this.findASCII85DecodeInlineStreamEnd(stream); - } else if (filterName === 'ASCIIHexDecode' || filterName === 'AHx') { - length = this.findASCIIHexDecodeInlineStreamEnd(stream); - } else { - length = this.findDefaultInlineStreamEnd(stream); - } - var imageStream = stream.makeSubStream(startPos, length, dict); - - // Cache all images below the MAX_LENGTH_TO_CACHE threshold by their - // adler32 checksum. - let cacheKey; - if (length < MAX_LENGTH_TO_CACHE && dictLength < MAX_ADLER32_LENGTH) { - var imageBytes = imageStream.getBytes(); - imageStream.reset(); - - const initialStreamPos = stream.pos; - // Set the stream position to the beginning of the dictionary data... - stream.pos = lexer.beginInlineImagePos; - // ... and fetch the bytes of the *entire* dictionary. - let dictBytes = stream.getBytes(dictLength); - // Finally, don't forget to reset the stream position. 
- stream.pos = initialStreamPos; - - cacheKey = computeAdler32(imageBytes) + '_' + computeAdler32(dictBytes); - - let cacheEntry = this.imageCache[cacheKey]; - if (cacheEntry !== undefined) { - this.buf2 = Cmd.get('EI'); - this.shift(); - - cacheEntry.reset(); - return cacheEntry; - } + this.shift(); + return ref; } + return num; + } + if (isString(buf1)) { // string + let str = buf1; if (cipherTransform) { - imageStream = cipherTransform.createStream(imageStream, length); + str = cipherTransform.decryptString(str); } + return str; + } - imageStream = this.filter(imageStream, dict, length); - imageStream.dict = dict; - if (cacheKey !== undefined) { - imageStream.cacheKey = 'inline_' + length + '_' + cacheKey; - this.imageCache[cacheKey] = imageStream; - } + // simple object + return buf1; + } - this.buf2 = Cmd.get('EI'); - this.shift(); - - return imageStream; - }, - - _findStreamLength(startPos, signature) { - const { stream, } = this.lexer; - stream.pos = startPos; - - const SCAN_BLOCK_LENGTH = 2048; - const signatureLength = signature.length; - - while (stream.pos < stream.end) { - const scanBytes = stream.peekBytes(SCAN_BLOCK_LENGTH); - const scanLength = scanBytes.length - signatureLength; - - if (scanLength <= 0) { - break; - } - let pos = 0; - while (pos < scanLength) { - let j = 0; - while (j < signatureLength && scanBytes[pos + j] === signature[j]) { - j++; - } - if (j >= signatureLength) { // `signature` found. 
- stream.pos += pos; - return (stream.pos - startPos); - } - pos++; - } - stream.pos += scanLength; - } - return -1; - }, - - makeStream: function Parser_makeStream(dict, cipherTransform) { - var lexer = this.lexer; - var stream = lexer.stream; - - // get stream start position - lexer.skipToNextLine(); - const startPos = stream.pos - 1; - - // get length - var length = dict.get('Length'); - if (!Number.isInteger(length)) { - info('Bad ' + length + ' attribute in stream'); - length = 0; - } - - // skip over the stream data - stream.pos = startPos + length; - lexer.nextChar(); - - // Shift '>>' and check whether the new object marks the end of the stream - if (this.tryShift() && isCmd(this.buf2, 'endstream')) { - this.shift(); // 'stream' + /** + * Find the end of the stream by searching for the /EI\s/. + * @returns {number} The inline stream length. + */ + findDefaultInlineStreamEnd(stream) { + const E = 0x45, I = 0x49, SPACE = 0x20, LF = 0xA, CR = 0xD; + const n = 10, NUL = 0x0; + let startPos = stream.pos, state = 0, ch, maybeEIPos; + while ((ch = stream.getByte()) !== -1) { + if (state === 0) { + state = (ch === E) ? 1 : 0; + } else if (state === 1) { + state = (ch === I) ? 2 : 0; } else { - // Bad stream length, scanning for endstream command. - const ENDSTREAM_SIGNATURE = new Uint8Array([ - 0x65, 0x6E, 0x64, 0x73, 0x74, 0x72, 0x65, 0x61, 0x6D]); - let actualLength = this._findStreamLength(startPos, - ENDSTREAM_SIGNATURE); - if (actualLength < 0) { - // Only allow limited truncation of the endstream signature, - // to prevent false positives. - const MAX_TRUNCATION = 1; - // Check if the PDF generator included truncated endstream commands, - // such as e.g. "endstrea" (fixes issue10004.pdf). 
- for (let i = 1; i <= MAX_TRUNCATION; i++) { - const end = ENDSTREAM_SIGNATURE.length - i; - const TRUNCATED_SIGNATURE = ENDSTREAM_SIGNATURE.slice(0, end); - - let maybeLength = this._findStreamLength(startPos, - TRUNCATED_SIGNATURE); - if (maybeLength >= 0) { - // Ensure that the byte immediately following the truncated - // endstream command is a space, to prevent false positives. - const lastByte = stream.peekBytes(end + 1)[end]; - if (!isSpace(lastByte)) { - break; - } - info(`Found "${bytesToString(TRUNCATED_SIGNATURE)}" when ` + - 'searching for endstream command.'); - actualLength = maybeLength; + assert(state === 2); + if (ch === SPACE || ch === LF || ch === CR) { + maybeEIPos = stream.pos; + // Let's check that the next `n` bytes are ASCII... just to be sure. + const followingBytes = stream.peekBytes(n); + for (let i = 0, ii = followingBytes.length; i < ii; i++) { + ch = followingBytes[i]; + if (ch === NUL && followingBytes[i + 1] !== NUL) { + // NUL bytes are not supposed to occur *outside* of inline + // images, but some PDF generators violate that assumption, + // thus breaking the EI detection heuristics used below. + // + // However, we can't unconditionally treat NUL bytes as "ASCII", + // since that *could* result in inline images being truncated. + // + // To attempt to address this, we'll still treat any *sequence* + // of NUL bytes as non-ASCII, but for a *single* NUL byte we'll + // continue checking the `followingBytes` (fixes issue8823.pdf). + continue; + } + if (ch !== LF && ch !== CR && (ch < SPACE || ch > 0x7F)) { + // Not a LF, CR, SPACE or any visible ASCII character, i.e. + // it's binary stuff. Resetting the state. + state = 0; break; } } - - if (actualLength < 0) { - throw new FormatError('Missing endstream command.'); + if (state === 2) { + break; // Finished! 
} + } else { + state = 0; } - length = actualLength; + } + } - lexer.nextChar(); + if (ch === -1) { + warn('findDefaultInlineStreamEnd: ' + + 'Reached the end of the stream without finding a valid EI marker'); + if (maybeEIPos) { + warn('... trying to recover by using the last "EI" occurrence.'); + stream.skip(-(stream.pos - maybeEIPos)); // Reset the stream position. + } + } + + let endOffset = 4; + stream.skip(-endOffset); // Set the stream position to just before "EI". + ch = stream.peekByte(); + stream.skip(endOffset); // ... and remember to reset the stream position. + + // Ensure that we don't accidentally truncate the inline image, when the + // data is immediately followed by the "EI" marker (fixes issue10388.pdf). + if (!isSpace(ch)) { + endOffset--; + } + return ((stream.pos - endOffset) - startPos); + } + + /** + * Find the EOI (end-of-image) marker 0xFFD9 of the stream. + * @returns {number} The inline stream length. + */ + findDCTDecodeInlineStreamEnd(stream) { + let startPos = stream.pos, foundEOI = false, b, markerLength, length; + while ((b = stream.getByte()) !== -1) { + if (b !== 0xFF) { // Not a valid marker. + continue; + } + switch (stream.getByte()) { + case 0x00: // Byte stuffing. + // 0xFF00 appears to be a very common byte sequence in JPEG images. + break; + + case 0xFF: // Fill byte. + // Avoid skipping a valid marker, resetting the stream position. 
+ stream.skip(-1); + break; + + case 0xD9: // EOI + foundEOI = true; + break; + + case 0xC0: // SOF0 + case 0xC1: // SOF1 + case 0xC2: // SOF2 + case 0xC3: // SOF3 + /* falls through */ + case 0xC5: // SOF5 + case 0xC6: // SOF6 + case 0xC7: // SOF7 + /* falls through */ + case 0xC9: // SOF9 + case 0xCA: // SOF10 + case 0xCB: // SOF11 + /* falls through */ + case 0xCD: // SOF13 + case 0xCE: // SOF14 + case 0xCF: // SOF15 + /* falls through */ + case 0xC4: // DHT + case 0xCC: // DAC + /* falls through */ + case 0xDA: // SOS + case 0xDB: // DQT + case 0xDC: // DNL + case 0xDD: // DRI + case 0xDE: // DHP + case 0xDF: // EXP + /* falls through */ + case 0xE0: // APP0 + case 0xE1: // APP1 + case 0xE2: // APP2 + case 0xE3: // APP3 + case 0xE4: // APP4 + case 0xE5: // APP5 + case 0xE6: // APP6 + case 0xE7: // APP7 + case 0xE8: // APP8 + case 0xE9: // APP9 + case 0xEA: // APP10 + case 0xEB: // APP11 + case 0xEC: // APP12 + case 0xED: // APP13 + case 0xEE: // APP14 + case 0xEF: // APP15 + /* falls through */ + case 0xFE: // COM + // The marker should be followed by the length of the segment. + markerLength = stream.getUint16(); + if (markerLength > 2) { + // |markerLength| contains the byte length of the marker segment, + // including its own length (2 bytes) and excluding the marker. + stream.skip(markerLength - 2); // Jump to the next marker. + } else { + // The marker length is invalid, resetting the stream position. + stream.skip(-2); + } + break; + } + if (foundEOI) { + break; + } + } + length = stream.pos - startPos; + if (b === -1) { + warn('Inline DCTDecode image stream: ' + + 'EOI marker not found, searching for /EI/ instead.'); + stream.skip(-length); // Reset the stream position. + return this.findDefaultInlineStreamEnd(stream); + } + this.inlineStreamSkipEI(stream); + return length; + } + + /** + * Find the EOD (end-of-data) marker '~>' (i.e. TILDE + GT) of the stream. + * @returns {number} The inline stream length. 
+ */ + findASCII85DecodeInlineStreamEnd(stream) { + const TILDE = 0x7E, GT = 0x3E; + let startPos = stream.pos, ch, length; + while ((ch = stream.getByte()) !== -1) { + if (ch === TILDE) { + ch = stream.peekByte(); + // Handle corrupt PDF documents which contains whitespace "inside" of + // the EOD marker (fixes issue10614.pdf). + while (isSpace(ch)) { + stream.skip(); + ch = stream.peekByte(); + } + if (ch === GT) { + stream.skip(); + break; + } + } + } + length = stream.pos - startPos; + if (ch === -1) { + warn('Inline ASCII85Decode image stream: ' + + 'EOD marker not found, searching for /EI/ instead.'); + stream.skip(-length); // Reset the stream position. + return this.findDefaultInlineStreamEnd(stream); + } + this.inlineStreamSkipEI(stream); + return length; + } + + /** + * Find the EOD (end-of-data) marker '>' (i.e. GT) of the stream. + * @returns {number} The inline stream length. + */ + findASCIIHexDecodeInlineStreamEnd(stream) { + const GT = 0x3E; + let startPos = stream.pos, ch, length; + while ((ch = stream.getByte()) !== -1) { + if (ch === GT) { + break; + } + } + length = stream.pos - startPos; + if (ch === -1) { + warn('Inline ASCIIHexDecode image stream: ' + + 'EOD marker not found, searching for /EI/ instead.'); + stream.skip(-length); // Reset the stream position. + return this.findDefaultInlineStreamEnd(stream); + } + this.inlineStreamSkipEI(stream); + return length; + } + + /** + * Skip over the /EI/ for streams where we search for an EOD marker. + */ + inlineStreamSkipEI(stream) { + const E = 0x45, I = 0x49; + let state = 0, ch; + while ((ch = stream.getByte()) !== -1) { + if (state === 0) { + state = (ch === E) ? 1 : 0; + } else if (state === 1) { + state = (ch === I) ? 2 : 0; + } else if (state === 2) { + break; + } + } + } + + makeInlineImage(cipherTransform) { + const lexer = this.lexer; + const stream = lexer.stream; + + // Parse dictionary. 
+ const dict = new Dict(this.xref); + let dictLength; + while (!isCmd(this.buf1, 'ID') && !isEOF(this.buf1)) { + if (!isName(this.buf1)) { + throw new FormatError('Dictionary key must be a name object'); + } + const key = this.buf1.name; + this.shift(); + if (isEOF(this.buf1)) { + break; + } + dict.set(key, this.getObj(cipherTransform)); + } + if (lexer.beginInlineImagePos !== -1) { + dictLength = stream.pos - lexer.beginInlineImagePos; + } + + // Extract the name of the first (i.e. the current) image filter. + const filter = dict.get('Filter', 'F'); + let filterName; + if (isName(filter)) { + filterName = filter.name; + } else if (Array.isArray(filter)) { + const filterZero = this.xref.fetchIfRef(filter[0]); + if (isName(filterZero)) { + filterName = filterZero.name; + } + } + + // Parse image stream. + const startPos = stream.pos; + let length; + if (filterName === 'DCTDecode' || filterName === 'DCT') { + length = this.findDCTDecodeInlineStreamEnd(stream); + } else if (filterName === 'ASCII85Decode' || filterName === 'A85') { + length = this.findASCII85DecodeInlineStreamEnd(stream); + } else if (filterName === 'ASCIIHexDecode' || filterName === 'AHx') { + length = this.findASCIIHexDecodeInlineStreamEnd(stream); + } else { + length = this.findDefaultInlineStreamEnd(stream); + } + let imageStream = stream.makeSubStream(startPos, length, dict); + + // Cache all images below the MAX_LENGTH_TO_CACHE threshold by their + // adler32 checksum. + let cacheKey; + if (length < MAX_LENGTH_TO_CACHE && dictLength < MAX_ADLER32_LENGTH) { + const imageBytes = imageStream.getBytes(); + imageStream.reset(); + + const initialStreamPos = stream.pos; + // Set the stream position to the beginning of the dictionary data... + stream.pos = lexer.beginInlineImagePos; + // ... and fetch the bytes of the *entire* dictionary. + const dictBytes = stream.getBytes(dictLength); + // Finally, don't forget to reset the stream position. 
+ stream.pos = initialStreamPos; + + cacheKey = computeAdler32(imageBytes) + '_' + computeAdler32(dictBytes); + + const cacheEntry = this.imageCache[cacheKey]; + if (cacheEntry !== undefined) { + this.buf2 = Cmd.get('EI'); this.shift(); - this.shift(); - } - this.shift(); // 'endstream' - stream = stream.makeSubStream(startPos, length, dict); - if (cipherTransform) { - stream = cipherTransform.createStream(stream, length); - } - stream = this.filter(stream, dict, length); - stream.dict = dict; - return stream; - }, - filter: function Parser_filter(stream, dict, length) { - var filter = dict.get('Filter', 'F'); - var params = dict.get('DecodeParms', 'DP'); - if (isName(filter)) { - if (Array.isArray(params)) { - warn('/DecodeParms should not contain an Array, ' + - 'when /Filter contains a Name.'); - } - return this.makeFilter(stream, filter.name, length, params); + cacheEntry.reset(); + return cacheEntry; } + } - var maybeLength = length; - if (Array.isArray(filter)) { - var filterArray = filter; - var paramsArray = params; - for (var i = 0, ii = filterArray.length; i < ii; ++i) { - filter = this.xref.fetchIfRef(filterArray[i]); - if (!isName(filter)) { - throw new FormatError('Bad filter name: ' + filter); - } + if (cipherTransform) { + imageStream = cipherTransform.createStream(imageStream, length); + } - params = null; - if (Array.isArray(paramsArray) && (i in paramsArray)) { - params = this.xref.fetchIfRef(paramsArray[i]); - } - stream = this.makeFilter(stream, filter.name, maybeLength, params); - // after the first stream the length variable is invalid - maybeLength = null; - } + imageStream = this.filter(imageStream, dict, length); + imageStream.dict = dict; + if (cacheKey !== undefined) { + imageStream.cacheKey = `inline_${length}_${cacheKey}`; + this.imageCache[cacheKey] = imageStream; + } + + this.buf2 = Cmd.get('EI'); + this.shift(); + + return imageStream; + } + + _findStreamLength(startPos, signature) { + const { stream, } = this.lexer; + stream.pos = 
startPos; + + const SCAN_BLOCK_LENGTH = 2048; + const signatureLength = signature.length; + + while (stream.pos < stream.end) { + const scanBytes = stream.peekBytes(SCAN_BLOCK_LENGTH); + const scanLength = scanBytes.length - signatureLength; + + if (scanLength <= 0) { + break; } - return stream; - }, - makeFilter: function Parser_makeFilter(stream, name, maybeLength, params) { - // Since the 'Length' entry in the stream dictionary can be completely - // wrong, e.g. zero for non-empty streams, only skip parsing the stream - // when we can be absolutely certain that it actually is empty. - if (maybeLength === 0) { - warn('Empty "' + name + '" stream.'); - return new NullStream(); - } - try { - var xrefStreamStats = this.xref.stats.streamTypes; - if (name === 'FlateDecode' || name === 'Fl') { - xrefStreamStats[StreamType.FLATE] = true; - if (params) { - return new PredictorStream(new FlateStream(stream, maybeLength), - maybeLength, params); - } - return new FlateStream(stream, maybeLength); + let pos = 0; + while (pos < scanLength) { + let j = 0; + while (j < signatureLength && scanBytes[pos + j] === signature[j]) { + j++; } - if (name === 'LZWDecode' || name === 'LZW') { - xrefStreamStats[StreamType.LZW] = true; - var earlyChange = 1; - if (params) { - if (params.has('EarlyChange')) { - earlyChange = params.get('EarlyChange'); + if (j >= signatureLength) { // `signature` found. + stream.pos += pos; + return (stream.pos - startPos); + } + pos++; + } + stream.pos += scanLength; + } + return -1; + } + + makeStream(dict, cipherTransform) { + const lexer = this.lexer; + let stream = lexer.stream; + + // Get the stream's start position. + lexer.skipToNextLine(); + const startPos = stream.pos - 1; + + // Get the length. + let length = dict.get('Length'); + if (!Number.isInteger(length)) { + info(`Bad length "${length}" in stream`); + length = 0; + } + + // Skip over the stream data. 
+ stream.pos = startPos + length; + lexer.nextChar(); + + // Shift '>>' and check whether the new object marks the end of the stream. + if (this.tryShift() && isCmd(this.buf2, 'endstream')) { + this.shift(); // 'stream' + } else { + // Bad stream length, scanning for endstream command. + const ENDSTREAM_SIGNATURE = new Uint8Array([ + 0x65, 0x6E, 0x64, 0x73, 0x74, 0x72, 0x65, 0x61, 0x6D]); + let actualLength = this._findStreamLength(startPos, + ENDSTREAM_SIGNATURE); + if (actualLength < 0) { + // Only allow limited truncation of the endstream signature, + // to prevent false positives. + const MAX_TRUNCATION = 1; + // Check if the PDF generator included truncated endstream commands, + // such as e.g. "endstrea" (fixes issue10004.pdf). + for (let i = 1; i <= MAX_TRUNCATION; i++) { + const end = ENDSTREAM_SIGNATURE.length - i; + const TRUNCATED_SIGNATURE = ENDSTREAM_SIGNATURE.slice(0, end); + + const maybeLength = this._findStreamLength(startPos, + TRUNCATED_SIGNATURE); + if (maybeLength >= 0) { + // Ensure that the byte immediately following the truncated + // endstream command is a space, to prevent false positives. 
+ const lastByte = stream.peekBytes(end + 1)[end]; + if (!isSpace(lastByte)) { + break; } - return new PredictorStream( - new LZWStream(stream, maybeLength, earlyChange), - maybeLength, params); + info(`Found "${bytesToString(TRUNCATED_SIGNATURE)}" when ` + + 'searching for endstream command.'); + actualLength = maybeLength; + break; } - return new LZWStream(stream, maybeLength, earlyChange); } - if (name === 'DCTDecode' || name === 'DCT') { - xrefStreamStats[StreamType.DCT] = true; - return new JpegStream(stream, maybeLength, stream.dict, params); - } - if (name === 'JPXDecode' || name === 'JPX') { - xrefStreamStats[StreamType.JPX] = true; - return new JpxStream(stream, maybeLength, stream.dict, params); - } - if (name === 'ASCII85Decode' || name === 'A85') { - xrefStreamStats[StreamType.A85] = true; - return new Ascii85Stream(stream, maybeLength); - } - if (name === 'ASCIIHexDecode' || name === 'AHx') { - xrefStreamStats[StreamType.AHX] = true; - return new AsciiHexStream(stream, maybeLength); - } - if (name === 'CCITTFaxDecode' || name === 'CCF') { - xrefStreamStats[StreamType.CCF] = true; - return new CCITTFaxStream(stream, maybeLength, params); - } - if (name === 'RunLengthDecode' || name === 'RL') { - xrefStreamStats[StreamType.RL] = true; - return new RunLengthStream(stream, maybeLength); - } - if (name === 'JBIG2Decode') { - xrefStreamStats[StreamType.JBIG] = true; - return new Jbig2Stream(stream, maybeLength, stream.dict, params); - } - warn('filter "' + name + '" not supported yet'); - return stream; - } catch (ex) { - if (ex instanceof MissingDataException) { - throw ex; - } - warn('Invalid stream: \"' + ex + '\"'); - return new NullStream(); - } - }, - }; - return Parser; -})(); + if (actualLength < 0) { + throw new FormatError('Missing endstream command.'); + } + } + length = actualLength; + + lexer.nextChar(); + this.shift(); + this.shift(); + } + this.shift(); // 'endstream' + + stream = stream.makeSubStream(startPos, length, dict); + if 
(cipherTransform) { + stream = cipherTransform.createStream(stream, length); + } + stream = this.filter(stream, dict, length); + stream.dict = dict; + return stream; + } + + filter(stream, dict, length) { + let filter = dict.get('Filter', 'F'); + let params = dict.get('DecodeParms', 'DP'); + + if (isName(filter)) { + if (Array.isArray(params)) { + warn('/DecodeParms should not contain an Array, ' + + 'when /Filter contains a Name.'); + } + return this.makeFilter(stream, filter.name, length, params); + } + + let maybeLength = length; + if (Array.isArray(filter)) { + let filterArray = filter; + let paramsArray = params; + for (let i = 0, ii = filterArray.length; i < ii; ++i) { + filter = this.xref.fetchIfRef(filterArray[i]); + if (!isName(filter)) { + throw new FormatError(`Bad filter name "${filter}"`); + } + + params = null; + if (Array.isArray(paramsArray) && (i in paramsArray)) { + params = this.xref.fetchIfRef(paramsArray[i]); + } + stream = this.makeFilter(stream, filter.name, maybeLength, params); + // After the first stream the `length` variable is invalid. + maybeLength = null; + } + } + return stream; + } + + makeFilter(stream, name, maybeLength, params) { + // Since the 'Length' entry in the stream dictionary can be completely + // wrong, e.g. zero for non-empty streams, only skip parsing the stream + // when we can be absolutely certain that it actually is empty. 
+ if (maybeLength === 0) { + warn(`Empty "${name}" stream.`); + return new NullStream(); + } + + try { + const xrefStreamStats = this.xref.stats.streamTypes; + if (name === 'FlateDecode' || name === 'Fl') { + xrefStreamStats[StreamType.FLATE] = true; + if (params) { + return new PredictorStream(new FlateStream(stream, maybeLength), + maybeLength, params); + } + return new FlateStream(stream, maybeLength); + } + if (name === 'LZWDecode' || name === 'LZW') { + xrefStreamStats[StreamType.LZW] = true; + let earlyChange = 1; + if (params) { + if (params.has('EarlyChange')) { + earlyChange = params.get('EarlyChange'); + } + return new PredictorStream( + new LZWStream(stream, maybeLength, earlyChange), + maybeLength, params); + } + return new LZWStream(stream, maybeLength, earlyChange); + } + if (name === 'DCTDecode' || name === 'DCT') { + xrefStreamStats[StreamType.DCT] = true; + return new JpegStream(stream, maybeLength, stream.dict, params); + } + if (name === 'JPXDecode' || name === 'JPX') { + xrefStreamStats[StreamType.JPX] = true; + return new JpxStream(stream, maybeLength, stream.dict, params); + } + if (name === 'ASCII85Decode' || name === 'A85') { + xrefStreamStats[StreamType.A85] = true; + return new Ascii85Stream(stream, maybeLength); + } + if (name === 'ASCIIHexDecode' || name === 'AHx') { + xrefStreamStats[StreamType.AHX] = true; + return new AsciiHexStream(stream, maybeLength); + } + if (name === 'CCITTFaxDecode' || name === 'CCF') { + xrefStreamStats[StreamType.CCF] = true; + return new CCITTFaxStream(stream, maybeLength, params); + } + if (name === 'RunLengthDecode' || name === 'RL') { + xrefStreamStats[StreamType.RL] = true; + return new RunLengthStream(stream, maybeLength); + } + if (name === 'JBIG2Decode') { + xrefStreamStats[StreamType.JBIG] = true; + return new Jbig2Stream(stream, maybeLength, stream.dict, params); + } + warn(`Filter "${name}" is not supported.`); + return stream; + } catch (ex) { + if (ex instanceof MissingDataException) { + throw 
ex; + } + warn(`Invalid stream: "${ex}"`); + return new NullStream(); + } + } +} var Lexer = (function LexerClosure() { function Lexer(stream, knownCommands) { From 8d4d7dbf58e79476fee127225f5908e8ef0fccd9 Mon Sep 17 00:00:00 2001 From: Tim van der Meij Date: Sun, 10 Mar 2019 14:27:26 +0100 Subject: [PATCH 2/5] Convert the `Lexer` class in `src/core/parser.js` to ES6 syntax --- src/core/parser.js | 814 +++++++++++++++++++++++---------------------- 1 file changed, 409 insertions(+), 405 deletions(-) diff --git a/src/core/parser.js b/src/core/parser.js index f41951e08..b29bcb0e2 100644 --- a/src/core/parser.js +++ b/src/core/parser.js @@ -714,8 +714,40 @@ class Parser { } } -var Lexer = (function LexerClosure() { - function Lexer(stream, knownCommands) { +// A '1' in this array means the character is white space. A '1' or +// '2' means the character ends a name or command. +const specialChars = [ + 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, // 0x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x + 1, 0, 0, 0, 0, 2, 0, 0, 2, 2, 0, 0, 0, 0, 0, 2, // 2x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, // 3x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 4x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0, // 5x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 6x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0, // 7x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // ax + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // bx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // cx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // dx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // ex + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // fx +]; + +function toHexDigit(ch) { + if (ch >= 0x30 && ch <= 0x39) { // '0'-'9' + return ch & 0x0F; + } + if ((ch >= 0x41 && ch <= 0x46) || (ch >= 0x61 && ch <= 0x66)) { + // 'A'-'F', 'a'-'f' + 
return (ch & 0x0F) + 9; + } + return -1; +} + +class Lexer { + constructor(stream, knownCommands) { this.stream = stream; this.nextChar(); @@ -738,435 +770,407 @@ var Lexer = (function LexerClosure() { this.beginInlineImagePos = -1; } - // A '1' in this array means the character is white space. A '1' or - // '2' means the character ends a name or command. - var specialChars = [ - 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, // 0x - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x - 1, 0, 0, 0, 0, 2, 0, 0, 2, 2, 0, 0, 0, 0, 0, 2, // 2x - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, // 3x - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 4x - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0, // 5x - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 6x - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0, // 7x - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8x - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9x - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // ax - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // bx - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // cx - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // dx - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // ex - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // fx - ]; - - function toHexDigit(ch) { - if (ch >= 0x30 && ch <= 0x39) { // '0'-'9' - return ch & 0x0F; - } - if ((ch >= 0x41 && ch <= 0x46) || (ch >= 0x61 && ch <= 0x66)) { - // 'A'-'F', 'a'-'f' - return (ch & 0x0F) + 9; - } - return -1; + nextChar() { + return (this.currentChar = this.stream.getByte()); } - Lexer.prototype = { - nextChar: function Lexer_nextChar() { - return (this.currentChar = this.stream.getByte()); - }, - peekChar: function Lexer_peekChar() { - return this.stream.peekByte(); - }, - getNumber: function Lexer_getNumber() { - var ch = this.currentChar; - var eNotation = false; - var divideBy = 0; // different from 0 if it's a floating point value - var sign = 0; + peekChar() { + return 
this.stream.peekByte(); + } + + getNumber() { + let ch = this.currentChar; + let eNotation = false; + let divideBy = 0; // Different from 0 if it's a floating point value. + let sign = 0; + + if (ch === 0x2D) { // '-' + sign = -1; + ch = this.nextChar(); if (ch === 0x2D) { // '-' - sign = -1; - ch = this.nextChar(); - - if (ch === 0x2D) { // '-' - // Ignore double negative (this is consistent with Adobe Reader). - ch = this.nextChar(); - } - } else if (ch === 0x2B) { // '+' - sign = 1; + // Ignore double negative (this is consistent with Adobe Reader). ch = this.nextChar(); } - if (ch === 0x0A || ch === 0x0D) { // LF, CR - // Ignore line-breaks (this is consistent with Adobe Reader). - do { - ch = this.nextChar(); - } while (ch === 0x0A || ch === 0x0D); - } - if (ch === 0x2E) { // '.' - divideBy = 10; + } else if (ch === 0x2B) { // '+' + sign = 1; + ch = this.nextChar(); + } + if (ch === 0x0A || ch === 0x0D) { // LF, CR + // Ignore line-breaks (this is consistent with Adobe Reader). + do { ch = this.nextChar(); + } while (ch === 0x0A || ch === 0x0D); + } + if (ch === 0x2E) { // '.' + divideBy = 10; + ch = this.nextChar(); + } + if (ch < 0x30 || ch > 0x39) { // '0' - '9' + if (divideBy === 10 && sign === 0 && + (isSpace(ch) || ch === /* EOF = */ -1)) { + // This is consistent with Adobe Reader (fixes issue9252.pdf). + warn('Lexer.getNumber - treating a single decimal point as zero.'); + return 0; } - if (ch < 0x30 || ch > 0x39) { // '0' - '9' - if (divideBy === 10 && sign === 0 && - (isSpace(ch) || ch === /* EOF = */ -1)) { - // This is consistent with Adobe Reader (fixes issue9252.pdf). 
- warn('Lexer.getNumber - treating a single decimal point as zero.'); - return 0; - } - throw new FormatError( - `Invalid number: ${String.fromCharCode(ch)} (charCode ${ch})`); - } + throw new FormatError( + `Invalid number: ${String.fromCharCode(ch)} (charCode ${ch})`); + } - sign = sign || 1; - var baseValue = ch - 0x30; // '0' - var powerValue = 0; - var powerValueSign = 1; + sign = sign || 1; + let baseValue = ch - 0x30; // '0' + let powerValue = 0; + let powerValueSign = 1; - while ((ch = this.nextChar()) >= 0) { - if (0x30 <= ch && ch <= 0x39) { // '0' - '9' - var currentDigit = ch - 0x30; // '0' - if (eNotation) { // We are after an 'e' or 'E' - powerValue = powerValue * 10 + currentDigit; - } else { - if (divideBy !== 0) { // We are after a point - divideBy *= 10; - } - baseValue = baseValue * 10 + currentDigit; - } - } else if (ch === 0x2E) { // '.' - if (divideBy === 0) { - divideBy = 1; - } else { - // A number can have only one '.' - break; - } - } else if (ch === 0x2D) { // '-' - // ignore minus signs in the middle of numbers to match - // Adobe's behavior - warn('Badly formatted number'); - } else if (ch === 0x45 || ch === 0x65) { // 'E', 'e' - // 'E' can be either a scientific notation or the beginning of a new - // operator - ch = this.peekChar(); - if (ch === 0x2B || ch === 0x2D) { // '+', '-' - powerValueSign = (ch === 0x2D) ? -1 : 1; - this.nextChar(); // Consume the sign character - } else if (ch < 0x30 || ch > 0x39) { // '0' - '9' - // The 'E' must be the beginning of a new operator - break; - } - eNotation = true; + while ((ch = this.nextChar()) >= 0) { + if (0x30 <= ch && ch <= 0x39) { // '0' - '9' + const currentDigit = ch - 0x30; // '0' + if (eNotation) { // We are after an 'e' or 'E'. 
+ powerValue = powerValue * 10 + currentDigit; } else { - // the last character doesn't belong to us - break; - } - } - - if (divideBy !== 0) { - baseValue /= divideBy; - } - if (eNotation) { - baseValue *= Math.pow(10, powerValueSign * powerValue); - } - return sign * baseValue; - }, - getString: function Lexer_getString() { - var numParen = 1; - var done = false; - var strBuf = this.strBuf; - strBuf.length = 0; - - var ch = this.nextChar(); - while (true) { - var charBuffered = false; - switch (ch | 0) { - case -1: - warn('Unterminated string'); - done = true; - break; - case 0x28: // '(' - ++numParen; - strBuf.push('('); - break; - case 0x29: // ')' - if (--numParen === 0) { - this.nextChar(); // consume strings ')' - done = true; - } else { - strBuf.push(')'); - } - break; - case 0x5C: // '\\' - ch = this.nextChar(); - switch (ch) { - case -1: - warn('Unterminated string'); - done = true; - break; - case 0x6E: // 'n' - strBuf.push('\n'); - break; - case 0x72: // 'r' - strBuf.push('\r'); - break; - case 0x74: // 't' - strBuf.push('\t'); - break; - case 0x62: // 'b' - strBuf.push('\b'); - break; - case 0x66: // 'f' - strBuf.push('\f'); - break; - case 0x5C: // '\' - case 0x28: // '(' - case 0x29: // ')' - strBuf.push(String.fromCharCode(ch)); - break; - case 0x30: case 0x31: case 0x32: case 0x33: // '0'-'3' - case 0x34: case 0x35: case 0x36: case 0x37: // '4'-'7' - var x = ch & 0x0F; - ch = this.nextChar(); - charBuffered = true; - if (ch >= 0x30 && ch <= 0x37) { // '0'-'7' - x = (x << 3) + (ch & 0x0F); - ch = this.nextChar(); - if (ch >= 0x30 && ch <= 0x37) { // '0'-'7' - charBuffered = false; - x = (x << 3) + (ch & 0x0F); - } - } - strBuf.push(String.fromCharCode(x)); - break; - case 0x0D: // CR - if (this.peekChar() === 0x0A) { // LF - this.nextChar(); - } - break; - case 0x0A: // LF - break; - default: - strBuf.push(String.fromCharCode(ch)); - break; - } - break; - default: - strBuf.push(String.fromCharCode(ch)); - break; - } - if (done) { - break; - } - if 
(!charBuffered) { - ch = this.nextChar(); - } - } - return strBuf.join(''); - }, - getName: function Lexer_getName() { - var ch, previousCh; - var strBuf = this.strBuf; - strBuf.length = 0; - while ((ch = this.nextChar()) >= 0 && !specialChars[ch]) { - if (ch === 0x23) { // '#' - ch = this.nextChar(); - if (specialChars[ch]) { - warn('Lexer_getName: ' + - 'NUMBER SIGN (#) should be followed by a hexadecimal number.'); - strBuf.push('#'); - break; - } - var x = toHexDigit(ch); - if (x !== -1) { - previousCh = ch; - ch = this.nextChar(); - var x2 = toHexDigit(ch); - if (x2 === -1) { - warn('Lexer_getName: Illegal digit (' + - String.fromCharCode(ch) + ') in hexadecimal number.'); - strBuf.push('#', String.fromCharCode(previousCh)); - if (specialChars[ch]) { - break; - } - strBuf.push(String.fromCharCode(ch)); - continue; - } - strBuf.push(String.fromCharCode((x << 4) | x2)); - } else { - strBuf.push('#', String.fromCharCode(ch)); + if (divideBy !== 0) { // We are after a point. + divideBy *= 10; } + baseValue = baseValue * 10 + currentDigit; + } + } else if (ch === 0x2E) { // '.' 
+ if (divideBy === 0) { + divideBy = 1; } else { - strBuf.push(String.fromCharCode(ch)); - } - } - if (strBuf.length > 127) { - warn('name token is longer than allowed by the spec: ' + strBuf.length); - } - return Name.get(strBuf.join('')); - }, - getHexString: function Lexer_getHexString() { - var strBuf = this.strBuf; - strBuf.length = 0; - var ch = this.currentChar; - var isFirstHex = true; - var firstDigit; - var secondDigit; - while (true) { - if (ch < 0) { - warn('Unterminated hex string'); - break; - } else if (ch === 0x3E) { // '>' - this.nextChar(); - break; - } else if (specialChars[ch] === 1) { - ch = this.nextChar(); - continue; - } else { - if (isFirstHex) { - firstDigit = toHexDigit(ch); - if (firstDigit === -1) { - warn('Ignoring invalid character "' + ch + '" in hex string'); - ch = this.nextChar(); - continue; - } - } else { - secondDigit = toHexDigit(ch); - if (secondDigit === -1) { - warn('Ignoring invalid character "' + ch + '" in hex string'); - ch = this.nextChar(); - continue; - } - strBuf.push(String.fromCharCode((firstDigit << 4) | secondDigit)); - } - isFirstHex = !isFirstHex; - ch = this.nextChar(); - } - } - return strBuf.join(''); - }, - getObj: function Lexer_getObj() { - // skip whitespace and comments - var comment = false; - var ch = this.currentChar; - while (true) { - if (ch < 0) { - return EOF; - } - if (comment) { - if (ch === 0x0A || ch === 0x0D) { // LF, CR - comment = false; - } - } else if (ch === 0x25) { // '%' - comment = true; - } else if (specialChars[ch] !== 1) { + // A number can have only one dot. break; } - ch = this.nextChar(); + } else if (ch === 0x2D) { // '-' + // Ignore minus signs in the middle of numbers to match + // Adobe's behavior. + warn('Badly formatted number: minus sign in the middle'); + } else if (ch === 0x45 || ch === 0x65) { // 'E', 'e' + // 'E' can be either a scientific notation or the beginning of a new + // operator. 
+ ch = this.peekChar(); + if (ch === 0x2B || ch === 0x2D) { // '+', '-' + powerValueSign = (ch === 0x2D) ? -1 : 1; + this.nextChar(); // Consume the sign character. + } else if (ch < 0x30 || ch > 0x39) { // '0' - '9' + // The 'E' must be the beginning of a new operator. + break; + } + eNotation = true; + } else { + // The last character doesn't belong to us. + break; } + } - // start reading token + if (divideBy !== 0) { + baseValue /= divideBy; + } + if (eNotation) { + baseValue *= Math.pow(10, powerValueSign * powerValue); + } + return sign * baseValue; + } + + getString() { + let numParen = 1; + let done = false; + const strBuf = this.strBuf; + strBuf.length = 0; + + let ch = this.nextChar(); + while (true) { + let charBuffered = false; switch (ch | 0) { - case 0x30: case 0x31: case 0x32: case 0x33: case 0x34: // '0'-'4' - case 0x35: case 0x36: case 0x37: case 0x38: case 0x39: // '5'-'9' - case 0x2B: case 0x2D: case 0x2E: // '+', '-', '.' - return this.getNumber(); + case -1: + warn('Unterminated string'); + done = true; + break; case 0x28: // '(' - return this.getString(); - case 0x2F: // '/' - return this.getName(); - // array punctuation - case 0x5B: // '[' - this.nextChar(); - return Cmd.get('['); - case 0x5D: // ']' - this.nextChar(); - return Cmd.get(']'); - // hex string or dict punctuation - case 0x3C: // '<' - ch = this.nextChar(); - if (ch === 0x3C) { - // dict punctuation - this.nextChar(); - return Cmd.get('<<'); - } - return this.getHexString(); - // dict punctuation - case 0x3E: // '>' - ch = this.nextChar(); - if (ch === 0x3E) { - this.nextChar(); - return Cmd.get('>>'); - } - return Cmd.get('>'); - case 0x7B: // '{' - this.nextChar(); - return Cmd.get('{'); - case 0x7D: // '}' - this.nextChar(); - return Cmd.get('}'); + ++numParen; + strBuf.push('('); + break; case 0x29: // ')' - // Consume the current character in order to avoid permanently hanging - // the worker thread if `Lexer.getObject` is called from within a loop - // containing try-catch 
statements, since we would otherwise attempt - // to parse the *same* character over and over (fixes issue8061.pdf). - this.nextChar(); - throw new FormatError(`Illegal character: ${ch}`); - } - - // command - var str = String.fromCharCode(ch); - var knownCommands = this.knownCommands; - var knownCommandFound = knownCommands && knownCommands[str] !== undefined; - while ((ch = this.nextChar()) >= 0 && !specialChars[ch]) { - // stop if known command is found and next character does not make - // the str a command - var possibleCommand = str + String.fromCharCode(ch); - if (knownCommandFound && knownCommands[possibleCommand] === undefined) { - break; - } - if (str.length === 128) { - throw new FormatError(`Command token too long: ${str.length}`); - } - str = possibleCommand; - knownCommandFound = knownCommands && knownCommands[str] !== undefined; - } - if (str === 'true') { - return true; - } - if (str === 'false') { - return false; - } - if (str === 'null') { - return null; - } - - if (str === 'BI') { - // Keep track of the current stream position, since it's needed in order - // to correctly cache inline images; see `Parser.makeInlineImage`. 
- this.beginInlineImagePos = this.stream.pos; - } - - return Cmd.get(str); - }, - skipToNextLine: function Lexer_skipToNextLine() { - var ch = this.currentChar; - while (ch >= 0) { - if (ch === 0x0D) { // CR - ch = this.nextChar(); - if (ch === 0x0A) { // LF - this.nextChar(); + if (--numParen === 0) { + this.nextChar(); // consume strings ')' + done = true; + } else { + strBuf.push(')'); } break; - } else if (ch === 0x0A) { // LF - this.nextChar(); + case 0x5C: // '\\' + ch = this.nextChar(); + switch (ch) { + case -1: + warn('Unterminated string'); + done = true; + break; + case 0x6E: // 'n' + strBuf.push('\n'); + break; + case 0x72: // 'r' + strBuf.push('\r'); + break; + case 0x74: // 't' + strBuf.push('\t'); + break; + case 0x62: // 'b' + strBuf.push('\b'); + break; + case 0x66: // 'f' + strBuf.push('\f'); + break; + case 0x5C: // '\' + case 0x28: // '(' + case 0x29: // ')' + strBuf.push(String.fromCharCode(ch)); + break; + case 0x30: case 0x31: case 0x32: case 0x33: // '0'-'3' + case 0x34: case 0x35: case 0x36: case 0x37: // '4'-'7' + let x = ch & 0x0F; + ch = this.nextChar(); + charBuffered = true; + if (ch >= 0x30 && ch <= 0x37) { // '0'-'7' + x = (x << 3) + (ch & 0x0F); + ch = this.nextChar(); + if (ch >= 0x30 && ch <= 0x37) { // '0'-'7' + charBuffered = false; + x = (x << 3) + (ch & 0x0F); + } + } + strBuf.push(String.fromCharCode(x)); + break; + case 0x0D: // CR + if (this.peekChar() === 0x0A) { // LF + this.nextChar(); + } + break; + case 0x0A: // LF + break; + default: + strBuf.push(String.fromCharCode(ch)); + break; + } break; - } + default: + strBuf.push(String.fromCharCode(ch)); + break; + } + if (done) { + break; + } + if (!charBuffered) { ch = this.nextChar(); } - }, - }; + } + return strBuf.join(''); + } - return Lexer; -})(); + getName() { + let ch, previousCh; + const strBuf = this.strBuf; + strBuf.length = 0; + + while ((ch = this.nextChar()) >= 0 && !specialChars[ch]) { + if (ch === 0x23) { // '#' + ch = this.nextChar(); + if 
(specialChars[ch]) { + warn('Lexer_getName: ' + + 'NUMBER SIGN (#) should be followed by a hexadecimal number.'); + strBuf.push('#'); + break; + } + const x = toHexDigit(ch); + if (x !== -1) { + previousCh = ch; + ch = this.nextChar(); + const x2 = toHexDigit(ch); + if (x2 === -1) { + warn(`Lexer_getName: Illegal digit (${String.fromCharCode(ch)}) ` + + 'in hexadecimal number.'); + strBuf.push('#', String.fromCharCode(previousCh)); + if (specialChars[ch]) { + break; + } + strBuf.push(String.fromCharCode(ch)); + continue; + } + strBuf.push(String.fromCharCode((x << 4) | x2)); + } else { + strBuf.push('#', String.fromCharCode(ch)); + } + } else { + strBuf.push(String.fromCharCode(ch)); + } + } + if (strBuf.length > 127) { + warn(`Name token is longer than allowed by the spec: ${strBuf.length}`); + } + return Name.get(strBuf.join('')); + } + + getHexString() { + const strBuf = this.strBuf; + strBuf.length = 0; + let ch = this.currentChar; + let isFirstHex = true; + let firstDigit, secondDigit; + + while (true) { + if (ch < 0) { + warn('Unterminated hex string'); + break; + } else if (ch === 0x3E) { // '>' + this.nextChar(); + break; + } else if (specialChars[ch] === 1) { + ch = this.nextChar(); + continue; + } else { + if (isFirstHex) { + firstDigit = toHexDigit(ch); + if (firstDigit === -1) { + warn(`Ignoring invalid character "${ch}" in hex string`); + ch = this.nextChar(); + continue; + } + } else { + secondDigit = toHexDigit(ch); + if (secondDigit === -1) { + warn(`Ignoring invalid character "${ch}" in hex string`); + ch = this.nextChar(); + continue; + } + strBuf.push(String.fromCharCode((firstDigit << 4) | secondDigit)); + } + isFirstHex = !isFirstHex; + ch = this.nextChar(); + } + } + return strBuf.join(''); + } + + getObj() { + // Skip whitespace and comments. 
+ let comment = false; + let ch = this.currentChar; + while (true) { + if (ch < 0) { + return EOF; + } + if (comment) { + if (ch === 0x0A || ch === 0x0D) { // LF, CR + comment = false; + } + } else if (ch === 0x25) { // '%' + comment = true; + } else if (specialChars[ch] !== 1) { + break; + } + ch = this.nextChar(); + } + + // Start reading a token. + switch (ch | 0) { + case 0x30: case 0x31: case 0x32: case 0x33: case 0x34: // '0'-'4' + case 0x35: case 0x36: case 0x37: case 0x38: case 0x39: // '5'-'9' + case 0x2B: case 0x2D: case 0x2E: // '+', '-', '.' + return this.getNumber(); + case 0x28: // '(' + return this.getString(); + case 0x2F: // '/' + return this.getName(); + // array punctuation + case 0x5B: // '[' + this.nextChar(); + return Cmd.get('['); + case 0x5D: // ']' + this.nextChar(); + return Cmd.get(']'); + // hex string or dict punctuation + case 0x3C: // '<' + ch = this.nextChar(); + if (ch === 0x3C) { + // dict punctuation + this.nextChar(); + return Cmd.get('<<'); + } + return this.getHexString(); + // dict punctuation + case 0x3E: // '>' + ch = this.nextChar(); + if (ch === 0x3E) { + this.nextChar(); + return Cmd.get('>>'); + } + return Cmd.get('>'); + case 0x7B: // '{' + this.nextChar(); + return Cmd.get('{'); + case 0x7D: // '}' + this.nextChar(); + return Cmd.get('}'); + case 0x29: // ')' + // Consume the current character in order to avoid permanently hanging + // the worker thread if `Lexer.getObject` is called from within a loop + // containing try-catch statements, since we would otherwise attempt + // to parse the *same* character over and over (fixes issue8061.pdf). + this.nextChar(); + throw new FormatError(`Illegal character: ${ch}`); + } + + // Start reading a command. 
+ let str = String.fromCharCode(ch); + const knownCommands = this.knownCommands; + let knownCommandFound = knownCommands && knownCommands[str] !== undefined; + while ((ch = this.nextChar()) >= 0 && !specialChars[ch]) { + // Stop if a known command is found and next character does not make + // the string a command. + const possibleCommand = str + String.fromCharCode(ch); + if (knownCommandFound && knownCommands[possibleCommand] === undefined) { + break; + } + if (str.length === 128) { + throw new FormatError(`Command token too long: ${str.length}`); + } + str = possibleCommand; + knownCommandFound = knownCommands && knownCommands[str] !== undefined; + } + if (str === 'true') { + return true; + } + if (str === 'false') { + return false; + } + if (str === 'null') { + return null; + } + + if (str === 'BI') { + // Keep track of the current stream position, since it's needed in order + // to correctly cache inline images; see `Parser.makeInlineImage`. + this.beginInlineImagePos = this.stream.pos; + } + + return Cmd.get(str); + } + + skipToNextLine() { + let ch = this.currentChar; + while (ch >= 0) { + if (ch === 0x0D) { // CR + ch = this.nextChar(); + if (ch === 0x0A) { // LF + this.nextChar(); + } + break; + } else if (ch === 0x0A) { // LF + this.nextChar(); + break; + } + ch = this.nextChar(); + } + } +} var Linearization = { create: function LinearizationCreate(stream) { From 7d3cb1957127b7fc7220bee02990f85de4a58f4f Mon Sep 17 00:00:00 2001 From: Tim van der Meij Date: Sun, 10 Mar 2019 14:46:06 +0100 Subject: [PATCH 3/5] Convert the `Linearization` class in `src/core/parser.js` to ES6 syntax Moreover, disable `var` usage for this file. 
--- src/core/parser.js | 59 ++++++++++++++++++++++++++-------------------- 1 file changed, 33 insertions(+), 26 deletions(-) diff --git a/src/core/parser.js b/src/core/parser.js index b29bcb0e2..21da1475b 100644 --- a/src/core/parser.js +++ b/src/core/parser.js @@ -12,6 +12,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +/* eslint no-var: error */ import { Ascii85Stream, AsciiHexStream, FlateStream, LZWStream, NullStream, @@ -1172,55 +1173,61 @@ class Lexer { } } -var Linearization = { - create: function LinearizationCreate(stream) { - function getInt(name, allowZeroValue) { - var obj = linDict.get(name); +class Linearization { + static create(stream) { + function getInt(linDict, name, allowZeroValue = false) { + const obj = linDict.get(name); if (Number.isInteger(obj) && (allowZeroValue ? obj >= 0 : obj > 0)) { return obj; } - throw new Error('The "' + name + '" parameter in the linearization ' + + throw new Error(`The "${name}" parameter in the linearization ` + 'dictionary is invalid.'); } - function getHints() { - var hints = linDict.get('H'), hintsLength, item; + + function getHints(linDict) { + const hints = linDict.get('H'); + let hintsLength; + if (Array.isArray(hints) && ((hintsLength = hints.length) === 2 || hintsLength === 4)) { - for (var index = 0; index < hintsLength; index++) { - if (!(Number.isInteger(item = hints[index]) && item > 0)) { - throw new Error('Hint (' + index + - ') in the linearization dictionary is invalid.'); + for (let index = 0; index < hintsLength; index++) { + const hint = hints[index]; + if (!(Number.isInteger(hint) && hint > 0)) { + throw new Error(`Hint (${index}) in the linearization dictionary ` + + 'is invalid.'); } } return hints; } throw new Error('Hint array in the linearization dictionary is invalid.'); } - var parser = new Parser(new Lexer(stream), false, null); - var obj1 = parser.getObj(); - var obj2 = parser.getObj(); - var obj3 = parser.getObj(); - var 
linDict = parser.getObj(); - var obj, length; + + const parser = new Parser(new Lexer(stream), false, null); + const obj1 = parser.getObj(); + const obj2 = parser.getObj(); + const obj3 = parser.getObj(); + const linDict = parser.getObj(); + let obj, length; if (!(Number.isInteger(obj1) && Number.isInteger(obj2) && isCmd(obj3, 'obj') && isDict(linDict) && isNum(obj = linDict.get('Linearized')) && obj > 0)) { return null; // No valid linearization dictionary found. - } else if ((length = getInt('L')) !== stream.length) { + } else if ((length = getInt(linDict, 'L')) !== stream.length) { throw new Error('The "L" parameter in the linearization dictionary ' + 'does not equal the stream length.'); } return { length, - hints: getHints(), - objectNumberFirst: getInt('O'), - endFirst: getInt('E'), - numPages: getInt('N'), - mainXRefEntriesOffset: getInt('T'), - pageFirst: (linDict.has('P') ? getInt('P', true) : 0), + hints: getHints(linDict), + objectNumberFirst: getInt(linDict, 'O'), + endFirst: getInt(linDict, 'E'), + numPages: getInt(linDict, 'N'), + mainXRefEntriesOffset: getInt(linDict, 'T'), + pageFirst: (linDict.has('P') ? + getInt(linDict, 'P', /* allowZeroValue = */ true) : 0), }; - }, -}; + } +} export { Lexer, From 2ee299a62b698fc1d5e6a2568526cbac6de4d938 Mon Sep 17 00:00:00 2001 From: Tim van der Meij Date: Sun, 10 Mar 2019 15:29:24 +0100 Subject: [PATCH 4/5] Convert `test/unit/parser_spec.js` to ES6 syntax Moreover, disable `var` usage for this file. --- test/unit/parser_spec.js | 161 +++++++++++++++++---------------------- 1 file changed, 68 insertions(+), 93 deletions(-) diff --git a/test/unit/parser_spec.js b/test/unit/parser_spec.js index 7c10ba253..0dde4385c 100644 --- a/test/unit/parser_spec.js +++ b/test/unit/parser_spec.js @@ -12,6 +12,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ +/* eslint no-var: error */ import { Lexer, Linearization } from '../../src/core/parser'; import { FormatError } from '../../src/shared/util'; @@ -21,65 +22,53 @@ import { StringStream } from '../../src/core/stream'; describe('parser', function() { describe('Lexer', function() { it('should stop parsing numbers at the end of stream', function() { - var input = new StringStream('11.234'); - var lexer = new Lexer(input); - var result = lexer.getNumber(); - - expect(result).toEqual(11.234); + const input = new StringStream('11.234'); + const lexer = new Lexer(input); + expect(lexer.getNumber()).toEqual(11.234); }); it('should parse PostScript numbers', function() { - var numbers = ['-.002', '34.5', '-3.62', '123.6e10', '1E-5', '-1.', '0.0', - '123', '-98', '43445', '0', '+17']; - for (var i = 0, ii = numbers.length; i < ii; i++) { - var num = numbers[i]; - var input = new StringStream(num); - var lexer = new Lexer(input); - var result = lexer.getNumber(); - - expect(result).toEqual(parseFloat(num)); + const numbers = ['-.002', '34.5', '-3.62', '123.6e10', '1E-5', '-1.', + '0.0', '123', '-98', '43445', '0', '+17']; + for (const number of numbers) { + const input = new StringStream(number); + const lexer = new Lexer(input); + expect(lexer.getNumber()).toEqual(parseFloat(number)); } }); it('should ignore double negative before number', function() { - var input = new StringStream('--205.88'); - var lexer = new Lexer(input); - var result = lexer.getNumber(); - - expect(result).toEqual(-205.88); + const input = new StringStream('--205.88'); + const lexer = new Lexer(input); + expect(lexer.getNumber()).toEqual(-205.88); }); it('should ignore minus signs in the middle of number', function() { - var input = new StringStream('205--.88'); - var lexer = new Lexer(input); - var result = lexer.getNumber(); - - expect(result).toEqual(205.88); + const input = new StringStream('205--.88'); + const lexer = new Lexer(input); + expect(lexer.getNumber()).toEqual(205.88); }); it('should 
ignore line-breaks between operator and digit in number', function() { - let minusInput = new StringStream('-\r\n205.88'); - let minusLexer = new Lexer(minusInput); - + const minusInput = new StringStream('-\r\n205.88'); + const minusLexer = new Lexer(minusInput); expect(minusLexer.getNumber()).toEqual(-205.88); - let plusInput = new StringStream('+\r\n205.88'); - let plusLexer = new Lexer(plusInput); - + const plusInput = new StringStream('+\r\n205.88'); + const plusLexer = new Lexer(plusInput); expect(plusLexer.getNumber()).toEqual(205.88); }); it('should treat a single decimal point as zero', function() { - let input = new StringStream('.'); - let lexer = new Lexer(input); - + const input = new StringStream('.'); + const lexer = new Lexer(input); expect(lexer.getNumber()).toEqual(0); - let numbers = ['..', '-.', '+.', '-\r\n.', '+\r\n.']; - for (let number of numbers) { - let input = new StringStream(number); - let lexer = new Lexer(input); + const numbers = ['..', '-.', '+.', '-\r\n.', '+\r\n.']; + for (const number of numbers) { + const input = new StringStream(number); + const lexer = new Lexer(input); expect(function() { return lexer.getNumber(); @@ -88,68 +77,54 @@ describe('parser', function() { }); it('should handle glued numbers and operators', function() { - var input = new StringStream('123ET'); - var lexer = new Lexer(input); - var value = lexer.getNumber(); - - expect(value).toEqual(123); + const input = new StringStream('123ET'); + const lexer = new Lexer(input); + expect(lexer.getNumber()).toEqual(123); // The lexer must not have consumed the 'E' expect(lexer.currentChar).toEqual(0x45); // 'E' }); it('should stop parsing strings at the end of stream', function() { - var input = new StringStream('(1$4)'); + const input = new StringStream('(1$4)'); input.getByte = function(super_getByte) { - // simulating end of file using null (see issue 2766) - var ch = super_getByte.call(input); + // Simulating end of file using null (see issue 2766). 
+ const ch = super_getByte.call(input); return (ch === 0x24 /* '$' */ ? -1 : ch); }.bind(input, input.getByte); - var lexer = new Lexer(input); - var result = lexer.getString(); - - expect(result).toEqual('1'); + const lexer = new Lexer(input); + expect(lexer.getString()).toEqual('1'); }); it('should not throw exception on bad input', function() { - // '8 0 2 15 5 2 2 2 4 3 2 4' - // should be parsed as - // '80 21 55 22 24 32' - var input = new StringStream('<7 0 2 15 5 2 2 2 4 3 2 4>'); - var lexer = new Lexer(input); - var result = lexer.getHexString(); - - expect(result).toEqual('p!U"$2'); + // '7 0 2 15 5 2 2 2 4 3 2 4' should be parsed as '70 21 55 22 24 32'. + const input = new StringStream('<7 0 2 15 5 2 2 2 4 3 2 4>'); + const lexer = new Lexer(input); + expect(lexer.getHexString()).toEqual('p!U"$2'); }); it('should ignore escaped CR and LF', function() { - // '(\101\\102)' - // should be parsed as - // "AB" - var input = new StringStream('(\\101\\\r\n\\102\\\r\\103\\\n\\104)'); - var lexer = new Lexer(input); - var result = lexer.getString(); - - expect(result).toEqual('ABCD'); + // '(\101\\102)' should be parsed as 'AB'. + const input = new StringStream('(\\101\\\r\n\\102\\\r\\103\\\n\\104)'); + const lexer = new Lexer(input); + expect(lexer.getString()).toEqual('ABCD'); }); it('should handle Names with invalid usage of NUMBER SIGN (#)', function() { - var inputNames = ['/# 680 0 R', '/#AQwerty', '/#A<>\n' + 'endobj' ); - var expectedLinearizationDict = { + const expectedLinearizationDict = { length: 90, hints: [1388, 863], objectNumberFirst: 133, @@ -197,9 +172,9 @@ describe('parser', function() { }); it('should reject a linearization dictionary with invalid ' + - 'integer parameters', function () { + 'integer parameters', function() { // The /L parameter should be equal to the stream length. 
- var stream1 = new StringStream( + const stream1 = new StringStream( '1 0 obj\n' + '<<\n' + '/Linearized 1\n' + @@ -212,13 +187,13 @@ describe('parser', function() { '>>\n' + 'endobj' ); - expect(function () { + expect(function() { return Linearization.create(stream1); }).toThrow(new Error('The "L" parameter in the linearization ' + 'dictionary does not equal the stream length.')); // The /E parameter should not be zero. - var stream2 = new StringStream( + const stream2 = new StringStream( '1 0 obj\n' + '<<\n' + '/Linearized 1\n' + @@ -231,13 +206,13 @@ describe('parser', function() { '>>\n' + 'endobj' ); - expect(function () { + expect(function() { return Linearization.create(stream2); }).toThrow(new Error('The "E" parameter in the linearization ' + 'dictionary is invalid.')); // The /O parameter should be an integer. - var stream3 = new StringStream( + const stream3 = new StringStream( '1 0 obj\n' + '<<\n' + '/Linearized 1\n' + @@ -250,16 +225,16 @@ describe('parser', function() { '>>\n' + 'endobj' ); - expect(function () { + expect(function() { return Linearization.create(stream3); }).toThrow(new Error('The "O" parameter in the linearization ' + 'dictionary is invalid.')); }); it('should reject a linearization dictionary with invalid hint parameters', - function () { + function() { // The /H parameter should be an array. - var stream1 = new StringStream( + const stream1 = new StringStream( '1 0 obj\n' + '<<\n' + '/Linearized 1\n' + @@ -272,13 +247,13 @@ describe('parser', function() { '>>\n' + 'endobj' ); - expect(function () { + expect(function() { return Linearization.create(stream1); }).toThrow(new Error('Hint array in the linearization dictionary ' + 'is invalid.')); // The hint array should contain two, or four, elements. 
- var stream2 = new StringStream( + const stream2 = new StringStream( '1 0 obj\n' + '<<\n' + '/Linearized 1\n' + @@ -291,13 +266,13 @@ describe('parser', function() { '>>\n' + 'endobj' ); - expect(function () { + expect(function() { return Linearization.create(stream2); }).toThrow(new Error('Hint array in the linearization dictionary ' + 'is invalid.')); // The hint array should not contain zero. - var stream3 = new StringStream( + const stream3 = new StringStream( '1 0 obj\n' + '<<\n' + '/Linearized 1\n' + @@ -310,7 +285,7 @@ describe('parser', function() { '>>\n' + 'endobj' ); - expect(function () { + expect(function() { return Linearization.create(stream3); }).toThrow(new Error('Hint (2) in the linearization dictionary ' + 'is invalid.')); From 4a4b197b9d2ce1a20690837df64db8fd183c70ac Mon Sep 17 00:00:00 2001 From: Tim van der Meij Date: Sun, 10 Mar 2019 15:33:45 +0100 Subject: [PATCH 5/5] Write more unit tests for the lexer and the parser Moreover, group the lexer unit tests per method. This matches what we do for other classes and makes it easier to see which methods we don't unit test, or unit test insufficiently. The parser itself is not unit tested yet, so this patch provides a start for doing so. The `inlineStreamSkipEI` method is used in other end marker detection methods, so it's important that its functionality is correct for proper parsing. 
--- test/unit/parser_spec.js | 258 +++++++++++++++++++++++++-------------- 1 file changed, 166 insertions(+), 92 deletions(-) diff --git a/test/unit/parser_spec.js b/test/unit/parser_spec.js index 0dde4385c..b26ed2dcf 100644 --- a/test/unit/parser_spec.js +++ b/test/unit/parser_spec.js @@ -14,110 +14,184 @@ */ /* eslint no-var: error */ -import { Lexer, Linearization } from '../../src/core/parser'; +import { Lexer, Linearization, Parser } from '../../src/core/parser'; import { FormatError } from '../../src/shared/util'; import { Name } from '../../src/core/primitives'; import { StringStream } from '../../src/core/stream'; describe('parser', function() { + describe('Parser', function() { + describe('inlineStreamSkipEI', function() { + it('should skip over the EI marker if it is found', function() { + const string = 'q 1 0 0 1 0 0 cm BI /W 10 /H 10 /BPC 1 ' + + '/F /A85 ID abc123~> EI Q'; + const input = new StringStream(string); + const lexer = new Lexer(input); + const parser = new Parser(lexer, /* allowStreams = */ true, + /* xref = */ null); + parser.inlineStreamSkipEI(input); + expect(input.pos).toEqual(string.indexOf('Q')); + expect(input.peekByte()).toEqual(0x51); // 'Q' + }); + + it('should skip to the end of stream if the EI marker is not found', + function() { + const string = 'q 1 0 0 1 0 0 cm BI /W 10 /H 10 /BPC 1 ' + + '/F /A85 ID abc123~> Q'; + const input = new StringStream(string); + const lexer = new Lexer(input); + const parser = new Parser(lexer, /* allowStreams = */ true, + /* xref = */ null); + parser.inlineStreamSkipEI(input); + expect(input.pos).toEqual(string.length); + expect(input.peekByte()).toEqual(-1); + }); + }); + }); + describe('Lexer', function() { - it('should stop parsing numbers at the end of stream', function() { - const input = new StringStream('11.234'); - const lexer = new Lexer(input); - expect(lexer.getNumber()).toEqual(11.234); - }); - - it('should parse PostScript numbers', function() { - const numbers = ['-.002', '34.5', 
'-3.62', '123.6e10', '1E-5', '-1.', - '0.0', '123', '-98', '43445', '0', '+17']; - for (const number of numbers) { - const input = new StringStream(number); + describe('nextChar', function() { + it('should return and set -1 when the end of the stream is reached', + function() { + const input = new StringStream(''); const lexer = new Lexer(input); - expect(lexer.getNumber()).toEqual(parseFloat(number)); - } - }); + expect(lexer.nextChar()).toEqual(-1); + expect(lexer.currentChar).toEqual(-1); + }); - it('should ignore double negative before number', function() { - const input = new StringStream('--205.88'); - const lexer = new Lexer(input); - expect(lexer.getNumber()).toEqual(-205.88); - }); - - it('should ignore minus signs in the middle of number', function() { - const input = new StringStream('205--.88'); - const lexer = new Lexer(input); - expect(lexer.getNumber()).toEqual(205.88); - }); - - it('should ignore line-breaks between operator and digit in number', - function() { - const minusInput = new StringStream('-\r\n205.88'); - const minusLexer = new Lexer(minusInput); - expect(minusLexer.getNumber()).toEqual(-205.88); - - const plusInput = new StringStream('+\r\n205.88'); - const plusLexer = new Lexer(plusInput); - expect(plusLexer.getNumber()).toEqual(205.88); - }); - - it('should treat a single decimal point as zero', function() { - const input = new StringStream('.'); - const lexer = new Lexer(input); - expect(lexer.getNumber()).toEqual(0); - - const numbers = ['..', '-.', '+.', '-\r\n.', '+\r\n.']; - for (const number of numbers) { - const input = new StringStream(number); + it('should return and set the character after the current position', + function() { + const input = new StringStream('123'); const lexer = new Lexer(input); - - expect(function() { - return lexer.getNumber(); - }).toThrowError(FormatError, /^Invalid number:\s/); - } + expect(lexer.nextChar()).toEqual(0x32); // '2' + expect(lexer.currentChar).toEqual(0x32); // '2' + }); }); - it('should 
handle glued numbers and operators', function() { - const input = new StringStream('123ET'); - const lexer = new Lexer(input); - expect(lexer.getNumber()).toEqual(123); - // The lexer must not have consumed the 'E' - expect(lexer.currentChar).toEqual(0x45); // 'E' - }); - - it('should stop parsing strings at the end of stream', function() { - const input = new StringStream('(1$4)'); - input.getByte = function(super_getByte) { - // Simulating end of file using null (see issue 2766). - const ch = super_getByte.call(input); - return (ch === 0x24 /* '$' */ ? -1 : ch); - }.bind(input, input.getByte); - const lexer = new Lexer(input); - expect(lexer.getString()).toEqual('1'); - }); - - it('should not throw exception on bad input', function() { - // '7 0 2 15 5 2 2 2 4 3 2 4' should be parsed as '70 21 55 22 24 32'. - const input = new StringStream('<7 0 2 15 5 2 2 2 4 3 2 4>'); - const lexer = new Lexer(input); - expect(lexer.getHexString()).toEqual('p!U"$2'); - }); - - it('should ignore escaped CR and LF', function() { - // '(\101\\102)' should be parsed as 'AB'. - const input = new StringStream('(\\101\\\r\n\\102\\\r\\103\\\n\\104)'); - const lexer = new Lexer(input); - expect(lexer.getString()).toEqual('ABCD'); - }); - - it('should handle Names with invalid usage of NUMBER SIGN (#)', function() { - const inputNames = ['/# 680 0 R', '/#AQwerty', '/#A<\102)' should be parsed as 'AB'. + const input = new StringStream('(\\101\\\r\n\\102\\\r\\103\\\n\\104)'); + const lexer = new Lexer(input); + expect(lexer.getString()).toEqual('ABCD'); + }); + }); + + describe('getHexString', function() { + it('should not throw exception on bad input', function() { + // '7 0 2 15 5 2 2 2 4 3 2 4' should be parsed as '70 21 55 22 24 32'. 
+ const input = new StringStream('<7 0 2 15 5 2 2 2 4 3 2 4>'); + const lexer = new Lexer(input); + expect(lexer.getHexString()).toEqual('p!U"$2'); + }); + }); + + describe('getName', function() { + it('should handle Names with invalid usage of NUMBER SIGN (#)', + function() { + const inputNames = ['/# 680 0 R', '/#AQwerty', '/#A<