Use the *full* inline image as the cacheKey in Parser.makeInlineImage (bug 1799927)

*Please note:* This only fixes the "wrong letter" part of bug 1799927.

It appears that the simple `computeAdler32` function, used when caching inline images, generates hash collisions for some (very short) TypedArrays. In this case that leads to some of the "letters", which are actually inline images, being rendered incorrectly.
Rather than switching to another hashing algorithm, e.g. the `MurmurHash3_64` class, we simply cache using a stringified version of the inline image data as the cacheKey to prevent any future collisions. While this will (naturally) lead to slightly higher peak memory usage, it'll however be limited to the current `Parser`-instance which means that it's not persistent.

One small benefit of these changes is that we can avoid creating lots of `Stream`-instances for already cached inline images.
This commit is contained in:
Jonas Jenwald 2022-11-10 14:00:23 +01:00
parent f7449563ef
commit b46e0d61cf
4 changed files with 28 additions and 29 deletions

View File

@@ -40,27 +40,23 @@ import { PredictorStream } from "./predictor_stream.js";
 import { RunLengthStream } from "./run_length_stream.js";
 
 const MAX_LENGTH_TO_CACHE = 1000;
-const MAX_ADLER32_LENGTH = 5552;
 
-function computeAdler32(bytes) {
-  const bytesLength = bytes.length;
-  if (
-    typeof PDFJSDev === "undefined" ||
-    PDFJSDev.test("!PRODUCTION || TESTING")
-  ) {
-    assert(
-      bytesLength < MAX_ADLER32_LENGTH,
-      'computeAdler32: Unsupported "bytes" length.'
-    );
-  }
-  let a = 1,
-    b = 0;
-  for (let i = 0; i < bytesLength; ++i) {
-    // No modulo required in the loop if `bytesLength < 5552`.
-    a += bytes[i] & 0xff;
-    b += a;
-  }
-  return (b % 65521 << 16) | a % 65521;
+function getInlineImageCacheKey(bytes) {
+  const strBuf = [],
+    ii = bytes.length;
+  let i = 0;
+  while (i < ii - 1) {
+    strBuf.push((bytes[i++] << 8) | bytes[i++]);
+  }
+  // Handle an odd number of elements.
+  if (i < ii) {
+    strBuf.push(bytes[i]);
+  }
+  // We purposely include the "raw" length in the cacheKey, to prevent any
+  // possible issues with hash collisions in the inline image cache.
+  // Here we also assume that `strBuf` is never larger than 8192 elements,
+  // please refer to the `bytesToString` implementation.
+  return ii + "_" + String.fromCharCode.apply(null, strBuf);
 }
 
 class Parser {
@@ -71,6 +67,7 @@ class Parser {
     this.recoveryMode = recoveryMode;
 
     this.imageCache = Object.create(null);
+    this._imageId = 0;
     this.refill();
   }
 
@@ -532,25 +529,19 @@ class Parser {
       default:
         length = this.findDefaultInlineStreamEnd(stream);
     }
-    let imageStream = stream.makeSubStream(startPos, length, dict);
 
     // Cache all images below the MAX_LENGTH_TO_CACHE threshold by their
-    // adler32 checksum.
+    // stringified content, to prevent possible hash collisions.
     let cacheKey;
-    if (length < MAX_LENGTH_TO_CACHE && dictLength < MAX_ADLER32_LENGTH) {
-      const imageBytes = imageStream.getBytes();
-      imageStream.reset();
-
+    if (length < MAX_LENGTH_TO_CACHE && dictLength > 0) {
       const initialStreamPos = stream.pos;
       // Set the stream position to the beginning of the dictionary data...
      stream.pos = lexer.beginInlineImagePos;
-      // ... and fetch the bytes of the *entire* dictionary.
-      const dictBytes = stream.getBytes(dictLength);
+      // ... and fetch the bytes of the dictionary *and* the inline image.
+      cacheKey = getInlineImageCacheKey(stream.getBytes(dictLength + length));
       // Finally, don't forget to reset the stream position.
       stream.pos = initialStreamPos;
 
-      cacheKey = computeAdler32(imageBytes) + "_" + computeAdler32(dictBytes);
-
       const cacheEntry = this.imageCache[cacheKey];
       if (cacheEntry !== undefined) {
         this.buf2 = Cmd.get("EI");
@@ -561,6 +552,7 @@ class Parser {
       }
     }
 
+    let imageStream = stream.makeSubStream(startPos, length, dict);
     if (cipherTransform) {
       imageStream = cipherTransform.createStream(imageStream, length);
     }
@@ -568,7 +560,7 @@ class Parser {
     imageStream = this.filter(imageStream, dict, length);
     imageStream.dict = dict;
     if (cacheKey !== undefined) {
-      imageStream.cacheKey = `inline_${length}_${cacheKey}`;
+      imageStream.cacheKey = `inline_img_${++this._imageId}`;
       this.imageCache[cacheKey] = imageStream;
     }

View File

@@ -534,6 +534,7 @@
 !issue14415.pdf
 !issue14307.pdf
 !issue14497.pdf
+!bug1799927.pdf
 !issue14502.pdf
 !issue13211.pdf
 !issue14627.pdf

BIN
test/pdfs/bug1799927.pdf Normal file

Binary file not shown.

View File

@@ -5921,6 +5921,12 @@
       "annotations": true,
       "type": "eq"
    },
+    { "id": "bug1799927",
+      "file": "pdfs/bug1799927.pdf",
+      "md5": "e6ad013c24e58e5b40c3bae50f04c8e8",
+      "rounds": 1,
+      "type": "eq"
+    },
    { "id": "annotation-line-without-appearance-empty-Rect",
      "file": "pdfs/annotation-line-without-appearance-empty-Rect.pdf",
      "md5": "65f2d3ef80acfea637718c3fc66043b7",