From 49b8cd5a6afd3c0d63dec7b73e0c406807610392 Mon Sep 17 00:00:00 2001 From: Jonas Jenwald Date: Sat, 26 Aug 2017 12:09:49 +0200 Subject: [PATCH] Attempt to improve the `EI` detection heuristics, for inline images, in streams containing `NUL` bytes (issue 8823) Since this patch will now treat (some) `NUL` bytes as "ASCII", the number of `followingBytes` checked is thus increased to (hopefully) reduce the risk of introducing new false positives. Fixes 8823. --- src/core/parser.js | 20 +++++++++++++++++--- test/pdfs/.gitignore | 1 + test/pdfs/issue8823.pdf | Bin 0 -> 1771 bytes test/test_manifest.json | 10 ++++++++-- 4 files changed, 26 insertions(+), 5 deletions(-) create mode 100644 test/pdfs/issue8823.pdf diff --git a/src/core/parser.js b/src/core/parser.js index db0c4533d..954bec8ba 100644 --- a/src/core/parser.js +++ b/src/core/parser.js @@ -148,7 +148,8 @@ var Parser = (function ParserClosure() { * @returns {number} The inline stream length. */ findDefaultInlineStreamEnd(stream) { - const E = 0x45, I = 0x49, SPACE = 0x20, LF = 0xA, CR = 0xD, n = 5; + const E = 0x45, I = 0x49, SPACE = 0x20, LF = 0xA, CR = 0xD; + const n = 10, NUL = 0x0; let startPos = stream.pos, state = 0, ch, maybeEIPos; while ((ch = stream.getByte()) !== -1) { if (state === 0) { @@ -159,10 +160,23 @@ var Parser = (function ParserClosure() { assert(state === 2); if (ch === SPACE || ch === LF || ch === CR) { maybeEIPos = stream.pos; - // Let's check the next `n` bytes are ASCII... just be sure. + // Let's check that the next `n` bytes are ASCII... just to be sure. let followingBytes = stream.peekBytes(n); - for (let i = 0; i < n; i++) { + for (let i = 0, ii = followingBytes.length; i < ii; i++) { ch = followingBytes[i]; + if (ch === NUL && followingBytes[i + 1] !== NUL) { + // NUL bytes are not supposed to occur *outside* of inline + // images, but some PDF generators violate that assumption, + // thus breaking the EI detection heuristics used below. 
+ // + // However, we can't unconditionally treat NUL bytes as "ASCII", + // since that *could* result in inline images being truncated. + // + // To attempt to address this, we'll still treat any *sequence* + // of NUL bytes as non-ASCII, but for a *single* NUL byte we'll + // continue checking the `followingBytes` (fixes issue8823.pdf). + continue; + } if (ch !== LF && ch !== CR && (ch < SPACE || ch > 0x7F)) { // Not a LF, CR, SPACE or any visible ASCII character, i.e. // it's binary stuff. Resetting the state. diff --git a/test/pdfs/.gitignore b/test/pdfs/.gitignore index 8e19cdd28..78c790ade 100644 --- a/test/pdfs/.gitignore +++ b/test/pdfs/.gitignore @@ -58,6 +58,7 @@ !issue8697.pdf !issue8707.pdf !issue8798r.pdf +!issue8823.pdf !bad-PageLabels.pdf !filled-background.pdf !ArabicCIDTrueType.pdf diff --git a/test/pdfs/issue8823.pdf b/test/pdfs/issue8823.pdf new file mode 100644 index 0000000000000000000000000000000000000000..9878a62ba342cf5336d6b80ddc9a7f86bedd7d12 GIT binary patch literal 1771 zcmdT_&2G~`5YBm|yu(~_r2FT!9Vv=b#|cp#sKJz%f%WrcI))z5*A%5_nrcwp#v4h?~w z*kx{#@&@Pph@*c(A?Y3U=}n%d##wT;iG%zWH7zR)nz-1^R^{3)C{W!9+CaU+_J40| zv8q3z5OLo}RMpO!t1euF@~6R3MkqALfOA}2L6QNo52yr9uV{~^(HXMTe$5y5r8BoA z$WE|!YfE3rbVN9_6u#5w(D)Z~7seTOPO)>19Sr^1aMVEqLO&Z2gaJ+|2}1f@aTGjm z5QNVTK}#d`I2xrm_Y6FLdPZ%N(UQqSc?6mMKPV4*<-r{AYE#|&#m#m|c2)i~MZQT% z*LU7c$J+K_-|Bw1wzl8hIC}vE68LUlsI}5~(cQ@enfif6D~sX-nINluJsCv%G7