From f05e5c54601a261e21d6cae77b1f3090b8d30a0c Mon Sep 17 00:00:00 2001
From: Jonas Jenwald
Date: Tue, 30 Jan 2018 12:26:33 +0100
Subject: [PATCH] Take the dictionary, and not just the image data, into
 account when caching inline images (issue 9398)

The reason for the bug is that we're only computing a checksum of the
image data itself, while completely ignoring the inline dictionary. The
latter is important, since in practice it's not uncommon for inline
images to be identical but use e.g. different ColourSpaces.

There are obviously a couple of different ways that we could compute a
hash/checksum of the dictionary.
Initially I tried using `MurmurHash3_64` to compute a hash of the
keys/values in the dictionary. Unfortunately this approach turned out
to be *way* too slow in practice, especially for PDF files with a huge
number of inline images; in particular issue 2618 would regress quite
badly with this solution.

The solution implemented in this patch is instead to compute a checksum
of the dictionary contents. While this is a much simpler, not to
mention a lot more efficient, solution, there's one drawback associated
with it: if the contents of inline image dictionaries are ordered
differently, they will not be considered equal with this approach,
which could thus lead to failures to cache repeated inline images.
In practice this doesn't seem to be a problem in any of the PDF files
I've tested, and generally I'd rather err on the side of *not* caching,
given that overly aggressive caching can easily lead to rendering bugs.

One small, but somewhat annoying, complication is that by the time
`Parser.makeInlineImage` is called, we no longer know the *exact*
stream position where the inline image dictionary starts. Having access
to that information is crucial here, and the easiest solution I could
come up with is to track this in the current `Lexer` instance.[1]

With the patch, we're thus able to fix the referenced issues without
incurring large regressions in problematic cases such as issue 2618.

Fixes 9398; also improves/fixes the `issue8823` reference test.

---
[1] Obviously I'd have preferred it if this patch could be limited to
`Parser.makeInlineImage`, without the need for this "hack", but I'm not
sure what that'd look like here.
---
 src/core/parser.js      | 62 ++++++++++++++++++++++++++++++-----------
 test/test_manifest.json |  3 +-
 2 files changed, 47 insertions(+), 18 deletions(-)

diff --git a/src/core/parser.js b/src/core/parser.js
index e6e60cd49..7f531ec0d 100644
--- a/src/core/parser.js
+++ b/src/core/parser.js
@@ -29,7 +29,22 @@ import { Jbig2Stream } from './jbig2_stream';
 import { JpegStream } from './jpeg_stream';
 import { JpxStream } from './jpx_stream';
 
-var MAX_LENGTH_TO_CACHE = 1000;
+const MAX_LENGTH_TO_CACHE = 1000;
+const MAX_ADLER32_LENGTH = 5552;
+
+function computeAdler32(bytes) {
+  let bytesLength = bytes.length;
+  if (bytesLength >= MAX_ADLER32_LENGTH) {
+    throw new Error('computeAdler32: The input is too large.');
+  }
+  let a = 1, b = 0;
+  for (let i = 0; i < bytesLength; ++i) {
+    // No modulo required in the loop if `bytesLength < 5552`.
+    a += bytes[i] & 0xFF;
+    b += a;
+  }
+  return ((b % 65521) << 16) | (a % 65521);
+}
 
 var Parser = (function ParserClosure() {
   function Parser(lexer, allowStreams, xref, recoveryMode) {
@@ -371,7 +386,7 @@ var Parser = (function ParserClosure() {
       var stream = lexer.stream;
 
       // Parse dictionary.
-      var dict = new Dict(this.xref);
+      let dict = new Dict(this.xref), dictLength;
       while (!isCmd(this.buf1, 'ID') && !isEOF(this.buf1)) {
         if (!isName(this.buf1)) {
           throw new FormatError('Dictionary key must be a name object');
@@ -383,6 +398,9 @@ var Parser = (function ParserClosure() {
         }
         dict.set(key, this.getObj(cipherTransform));
       }
+      if (lexer.beginInlineImagePos !== -1) {
+        dictLength = stream.pos - lexer.beginInlineImagePos;
+      }
 
       // Extract the name of the first (i.e. the current) image filter.
       var filter = dict.get('Filter', 'F'), filterName;
@@ -396,7 +414,7 @@ var Parser = (function ParserClosure() {
       }
 
       // Parse image stream.
-      var startPos = stream.pos, length, i, ii;
+      let startPos = stream.pos, length;
       if (filterName === 'DCTDecode' || filterName === 'DCT') {
         length = this.findDCTDecodeInlineStreamEnd(stream);
       } else if (filterName === 'ASCII85Decode' || filterName === 'A85') {
@@ -410,21 +428,22 @@
 
       // Cache all images below the MAX_LENGTH_TO_CACHE threshold by their
       // adler32 checksum.
-      var adler32;
-      if (length < MAX_LENGTH_TO_CACHE) {
+      let cacheKey;
+      if (length < MAX_LENGTH_TO_CACHE && dictLength < MAX_ADLER32_LENGTH) {
         var imageBytes = imageStream.getBytes();
         imageStream.reset();
 
-        var a = 1;
-        var b = 0;
-        for (i = 0, ii = imageBytes.length; i < ii; ++i) {
-          // No modulo required in the loop if imageBytes.length < 5552.
-          a += imageBytes[i] & 0xff;
-          b += a;
-        }
-        adler32 = ((b % 65521) << 16) | (a % 65521);
+        const initialStreamPos = stream.pos;
+        // Set the stream position to the beginning of the dictionary data...
+        stream.pos = lexer.beginInlineImagePos;
+        // ... and fetch the bytes of the *entire* dictionary.
+        let dictBytes = stream.getBytes(dictLength);
+        // Finally, don't forget to reset the stream position.
+        stream.pos = initialStreamPos;
 
-        let cacheEntry = this.imageCache[adler32];
+        cacheKey = computeAdler32(imageBytes) + '_' + computeAdler32(dictBytes);
+
+        let cacheEntry = this.imageCache[cacheKey];
         if (cacheEntry !== undefined) {
           this.buf2 = Cmd.get('EI');
           this.shift();
@@ -440,9 +459,9 @@
 
       imageStream = this.filter(imageStream, dict, length);
       imageStream.dict = dict;
-      if (adler32 !== undefined) {
-        imageStream.cacheKey = 'inline_' + length + '_' + adler32;
-        this.imageCache[adler32] = imageStream;
+      if (cacheKey !== undefined) {
+        imageStream.cacheKey = 'inline_' + length + '_' + cacheKey;
+        this.imageCache[cacheKey] = imageStream;
       }
 
       this.buf2 = Cmd.get('EI');
@@ -653,6 +672,8 @@ var Lexer = (function LexerClosure() {
     // 'fa', 'fal', 'fals'. The prefixes are not needed, if the command has no
     // other commands or literals as a prefix. The knowCommands is optional.
     this.knownCommands = knownCommands;
+
+    this.beginInlineImagePos = -1;
   }
 
   // A '1' in this array means the character is white space. A '1' or
@@ -1047,6 +1068,13 @@
       if (str === 'null') {
         return null;
       }
+
+      if (str === 'BI') {
+        // Keep track of the current stream position, since it's needed in order
+        // to correctly cache inline images; see `Parser.makeInlineImage`.
+        this.beginInlineImagePos = this.stream.pos;
+      }
+
       return Cmd.get(str);
     },
     skipToNextLine: function Lexer_skipToNextLine() {
diff --git a/test/test_manifest.json b/test/test_manifest.json
index 094007cae..0906ffd9a 100644
--- a/test/test_manifest.json
+++ b/test/test_manifest.json
@@ -3192,7 +3192,8 @@
        "md5": "ad02d4aa374b315bf1766038d002d57a",
        "link": false,
        "rounds": 1,
-       "type": "eq"
+       "type": "eq",
+       "about": "Also tests issue9398."
     },
     {  "id": "issue8613",
        "file": "pdfs/issue8613.pdf",
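
Note (illustrative sketch, not part of the patch itself): the caching scheme
described above boils down to computing an Adler-32 checksum over both the raw
inline image dictionary bytes and the image data bytes, and combining the two
into a single cache key. The standalone JavaScript below summarizes that idea;
the helper name `makeInlineImageCacheKey` is hypothetical and exists only for
this example, whereas the patch performs the equivalent steps inline in
`Parser.makeInlineImage`.

// Adler-32 checksum of a byte array; only valid for inputs shorter than
// 5552 bytes, which is what allows the modulo to be deferred until after
// the loop (mirrors `computeAdler32` in the patch above).
const MAX_ADLER32_LENGTH = 5552;

function computeAdler32(bytes) {
  if (bytes.length >= MAX_ADLER32_LENGTH) {
    throw new Error('computeAdler32: The input is too large.');
  }
  let a = 1, b = 0;
  for (let i = 0; i < bytes.length; ++i) {
    a += bytes[i] & 0xFF;
    b += a;
  }
  return ((b % 65521) << 16) | (a % 65521);
}

// Hypothetical helper (for illustration only): build the cache key from the
// raw dictionary bytes *and* the image bytes, so that identical image data
// combined with a different dictionary no longer collides in the cache. In
// the patch, the cached stream's `cacheKey` property additionally gets
// 'inline_' + length + '_' prepended to this value.
function makeInlineImageCacheKey(dictBytes, imageBytes) {
  return computeAdler32(imageBytes) + '_' + computeAdler32(dictBytes);
}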
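
As a quick usage example of the sketch above (all values are made up, and the
dictionary strings are merely placeholders rather than a real content stream),
two inline images with byte-identical image data but different dictionaries
now receive distinct cache keys:

// Made-up example: the same four bytes of image data, but two different
// hypothetical inline image dictionaries; here a 4x1 vs. a 2x2 image, and a
// differing /ColorSpace entry, as in issue 9398, behaves the same way.
const imageBytes = new Uint8Array([0x11, 0x22, 0x33, 0x44]);
const dictA = new TextEncoder().encode('/W 4 /H 1 /BPC 8 /CS /DeviceGray');
const dictB = new TextEncoder().encode('/W 2 /H 2 /BPC 8 /CS /DeviceGray');

const keyA = makeInlineImageCacheKey(dictA, imageBytes);
const keyB = makeInlineImageCacheKey(dictB, imageBytes);

// With only the image-data checksum both images would have shared one cache
// entry; including the dictionary checksum keeps them separate.
console.log(keyA !== keyB); // true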