Merge pull request #15679 from Snuffleupagus/bug-1799927-2

Use the *full* inline image as the cacheKey in `Parser.makeInlineImage` (bug 1799927)
2022-11-10 22:54:48 +01:00 · 2022-11-10 22:54:48 +01:00 · 595711bd7c
commit 595711bd7c
parent 592d92424e e8ec6af73e
5 changed files with 41 additions and 36 deletions
--- a/src/core/parser.js
+++ b/src/core/parser.js
@ -40,27 +40,23 @@ import { PredictorStream } from "./predictor_stream.js";
 import { RunLengthStream } from "./run_length_stream.js";
 const MAX_LENGTH_TO_CACHE = 1000;
 const MAX_ADLER32_LENGTH = 5552;
-function computeAdler32(bytes) {
+function getInlineImageCacheKey(bytes) {
-  const bytesLength = bytes.length;
+  const strBuf = [],
-  if (
+    ii = bytes.length;
-    typeof PDFJSDev === "undefined" ||
+  let i = 0;
-    PDFJSDev.test("!PRODUCTION || TESTING")
+  while (i < ii - 1) {
-  ) {
+    strBuf.push((bytes[i++] << 8) | bytes[i++]);
    assert(
      bytesLength < MAX_ADLER32_LENGTH,
      'computeAdler32: Unsupported "bytes" length.'
    );
  }
-  let a = 1,
+  // Handle an odd number of elements.
-    b = 0;
+  if (i < ii) {
-  for (let i = 0; i < bytesLength; ++i) {
+    strBuf.push(bytes[i]);
    // No modulo required in the loop if `bytesLength < 5552`.
    a += bytes[i] & 0xff;
    b += a;
  }
-  return (b % 65521 << 16) | a % 65521;
+  // We purposely include the "raw" length in the cacheKey, to prevent any
  // possible issues with hash collisions in the inline image cache.
  // Here we also assume that `strBuf` is never larger than 8192 elements,
  // please refer to the `bytesToString` implementation.
  return ii + "_" + String.fromCharCode.apply(null, strBuf);
 }
 class Parser {
@ -71,6 +67,7 @@ class Parser {
    this.recoveryMode = recoveryMode;
    this.imageCache = Object.create(null);
    this._imageId = 0;
    this.refill();
  }
@ -483,8 +480,9 @@ class Parser {
    const lexer = this.lexer;
    const stream = lexer.stream;
-    // Parse dictionary.
+    // Parse dictionary, but initialize it lazily to improve performance with
-    const dict = new Dict(this.xref);
+    // cached inline images (see issue 2618).
    const dictMap = Object.create(null);
    let dictLength;
    while (!isCmd(this.buf1, "ID") && this.buf1 !== EOF) {
      if (!(this.buf1 instanceof Name)) {
@ -495,14 +493,14 @@ class Parser {
      if (this.buf1 === EOF) {
        break;
      }
-      dict.set(key, this.getObj(cipherTransform));
+      dictMap[key] = this.getObj(cipherTransform);
    }
    if (lexer.beginInlineImagePos !== -1) {
      dictLength = stream.pos - lexer.beginInlineImagePos;
    }
    // Extract the name of the first (i.e. the current) image filter.
-    const filter = dict.get("F", "Filter");
+    const filter = this.xref.fetchIfRef(dictMap.F || dictMap.Filter);
    let filterName;
    if (filter instanceof Name) {
      filterName = filter.name;
@ -532,25 +530,19 @@ class Parser {
      default:
        length = this.findDefaultInlineStreamEnd(stream);
    }
    let imageStream = stream.makeSubStream(startPos, length, dict);
    // Cache all images below the MAX_LENGTH_TO_CACHE threshold by their
-    // adler32 checksum.
+    // stringified content, to prevent possible hash collisions.
    let cacheKey;
-    if (length < MAX_LENGTH_TO_CACHE && dictLength < MAX_ADLER32_LENGTH) {
+    if (length < MAX_LENGTH_TO_CACHE && dictLength > 0) {
      const imageBytes = imageStream.getBytes();
      imageStream.reset();
      const initialStreamPos = stream.pos;
      // Set the stream position to the beginning of the dictionary data...
      stream.pos = lexer.beginInlineImagePos;
-      // ... and fetch the bytes of the *entire* dictionary.
+      // ... and fetch the bytes of the dictionary *and* the inline image.
-      const dictBytes = stream.getBytes(dictLength);
+      cacheKey = getInlineImageCacheKey(stream.getBytes(dictLength + length));
      // Finally, don't forget to reset the stream position.
      stream.pos = initialStreamPos;
      cacheKey = computeAdler32(imageBytes) + "_" + computeAdler32(dictBytes);
      const cacheEntry = this.imageCache[cacheKey];
      if (cacheEntry !== undefined) {
        this.buf2 = Cmd.get("EI");
@ -561,6 +553,11 @@ class Parser {
      }
    }
    const dict = new Dict(this.xref);
    for (const key in dictMap) {
      dict.set(key, dictMap[key]);
    }
    let imageStream = stream.makeSubStream(startPos, length, dict);
    if (cipherTransform) {
      imageStream = cipherTransform.createStream(imageStream, length);
    }
@ -568,7 +565,7 @@ class Parser {
    imageStream = this.filter(imageStream, dict, length);
    imageStream.dict = dict;
    if (cacheKey !== undefined) {
-      imageStream.cacheKey = `inline_${length}_${cacheKey}`;
+      imageStream.cacheKey = `inline_img_${++this._imageId}`;
      this.imageCache[cacheKey] = imageStream;
    }
--- a/src/shared/murmurhash3.js
+++ b/src/shared/murmurhash3.js
@ -130,9 +130,10 @@ class MurmurHash3_64 {
      (((((h2 << 16) | (h1 >>> 16)) * 0xb9fe1a85) & MASK_HIGH) >>> 16);
    h1 ^= h2 >>> 1;
-    const hex1 = (h1 >>> 0).toString(16),
+    return (
-      hex2 = (h2 >>> 0).toString(16);
+      (h1 >>> 0).toString(16).padStart(8, "0") +
-    return hex1.padStart(8, "0") + hex2.padStart(8, "0");
+      (h2 >>> 0).toString(16).padStart(8, "0")
    );
  }
 }
--- a/test/pdfs/.gitignore
+++ b/test/pdfs/.gitignore
@ -534,6 +534,7 @@
 !issue14415.pdf
 !issue14307.pdf
 !issue14497.pdf
 !bug1799927.pdf
 !issue14502.pdf
 !issue13211.pdf
 !issue14627.pdf
--- a/test/pdfs/bug1799927.pdf
+++ b/test/pdfs/bug1799927.pdf
--- a/test/test_manifest.json
+++ b/test/test_manifest.json
@ -5921,6 +5921,12 @@
       "annotations": true,
       "type": "eq"
    },
    {  "id": "bug1799927",
       "file": "pdfs/bug1799927.pdf",
       "md5": "e6ad013c24e58e5b40c3bae50f04c8e8",
       "rounds": 1,
       "type": "eq"
    },
    {  "id": "annotation-line-without-appearance-empty-Rect",
       "file": "pdfs/annotation-line-without-appearance-empty-Rect.pdf",
       "md5": "65f2d3ef80acfea637718c3fc66043b7",