From b5254f274588839d92b28df3f9d89344351b6d8c Mon Sep 17 00:00:00 2001
From: Jonas Jenwald
Date: Thu, 18 Jul 2019 16:24:25 +0200
Subject: [PATCH] Attempt to significantly reduce the number of
 `ChunkedStream.{ensureByte, ensureRange}` calls by inlining the
 `this.progressiveDataLength` checks at the call-sites

The number of `ChunkedStream.ensureByte` calls in particular is often absolutely *huge* (on the order of millions of calls) when loading and rendering even moderately complicated PDF files, which isn't entirely surprising considering that the `getByte`/`getBytes`/`peekByte`/`peekBytes` methods are used for essentially all data reading/parsing.

The idea implemented in this patch is to inline an inverted `progressiveDataLength` check at all of the `ensureByte`/`ensureRange` call-sites, which in practice will often result in *several* orders of magnitude fewer function calls.

Obviously this patch will only help if the browser supports streaming, which all reasonably modern browsers now do (including the Firefox built-in PDF viewer), and assuming that the user didn't set the `disableStream` option (e.g. for using `disableAutoFetch`). However, I think we should be able to improve performance for the default out-of-the-box use case, without worrying about e.g. older browsers (where this patch will thus incur *one* additional check before calling `ensureByte`/`ensureRange`).

This patch was inspired by the *first* commit in PR 5005, which was subsequently backed out in PR 5145 for causing regressions. Since the general idea of avoiding unnecessary function calls was really nice, I figured that re-attempting this in one way or another wouldn't be a bad idea. Given that streaming is now supported, which it wasn't back then, using `progressiveDataLength` seemed like an easier approach in general, since it also allowed supporting both `ensureByte` and `ensureRange`.

This sort of patch obviously needs data to back it up, hence I've benchmarked the changes using the following manifest file (with the default `tracemonkey` file):

```
[
  { "id": "tracemonkey-eq",
    "file": "pdfs/tracemonkey.pdf",
    "md5": "9a192d8b1a7dc652a19835f6f08098bd",
    "rounds": 250,
    "type": "eq"
  }
]
```

I get the following complete results when comparing this patch against the `master` branch:

```
-- Grouped By browser, stat --
browser | stat         | Count | Baseline(ms) | Current(ms) | +/- |    %  | Result(P<.05)
------- | ------------ | ----- | ------------ | ----------- | --- | ----- | -------------
Firefox | Overall      |  3500 |          140 |         134 |  -6 | -4.46 |        faster
Firefox | Page Request |  3500 |            2 |           2 |   0 | -0.10 |
Firefox | Rendering    |  3500 |          138 |         131 |  -6 | -4.54 |        faster
```

Here it's pretty clear that the patch does have a positive net effect, even for a PDF file of fairly moderate size and complexity.
However, in this case it's probably interesting to also look at the results per page:

```
-- Grouped By page, stat --
page | stat         | Count | Baseline(ms) | Current(ms) | +/- |    %   | Result(P<.05)
---- | ------------ | ----- | ------------ | ----------- | --- | ------ | -------------
0    | Overall      |   250 |           74 |          75 |   1 |   0.69 |
0    | Page Request |   250 |            1 |           1 |   0 |  33.20 |
0    | Rendering    |   250 |           73 |          74 |   0 |   0.25 |
1    | Overall      |   250 |          123 |         121 |  -2 |  -1.87 |        faster
1    | Page Request |   250 |            3 |           2 |   0 | -11.73 |
1    | Rendering    |   250 |          121 |         119 |  -2 |  -1.67 |
2    | Overall      |   250 |           64 |          63 |  -1 |  -1.91 |
2    | Page Request |   250 |            1 |           1 |   0 |   8.81 |
2    | Rendering    |   250 |           63 |          62 |  -1 |  -2.13 |        faster
3    | Overall      |   250 |           97 |          97 |   0 |  -0.06 |
3    | Page Request |   250 |            1 |           1 |   0 |  25.37 |
3    | Rendering    |   250 |           96 |          95 |   0 |  -0.34 |
4    | Overall      |   250 |           97 |          97 |   0 |  -0.38 |
4    | Page Request |   250 |            1 |           1 |   0 |  -5.97 |
4    | Rendering    |   250 |           96 |          96 |   0 |  -0.27 |
5    | Overall      |   250 |           99 |          97 |  -3 |  -2.92 |
5    | Page Request |   250 |            2 |           1 |   0 | -17.20 |
5    | Rendering    |   250 |           98 |          95 |  -3 |  -2.68 |
6    | Overall      |   250 |           99 |          99 |   0 |  -0.14 |
6    | Page Request |   250 |            2 |           2 |   0 | -16.49 |
6    | Rendering    |   250 |           97 |          98 |   0 |   0.16 |
7    | Overall      |   250 |           96 |          95 |  -1 |  -0.55 |
7    | Page Request |   250 |            1 |           2 |   1 |  66.67 |        slower
7    | Rendering    |   250 |           95 |          94 |  -1 |  -1.19 |
8    | Overall      |   250 |           92 |          92 |  -1 |  -0.69 |
8    | Page Request |   250 |            1 |           1 |   0 | -17.60 |
8    | Rendering    |   250 |           91 |          91 |   0 |  -0.52 |
9    | Overall      |   250 |          112 |         112 |   0 |   0.29 |
9    | Page Request |   250 |            2 |           1 |   0 |  -7.92 |
9    | Rendering    |   250 |          110 |         111 |   0 |   0.37 |
10   | Overall      |   250 |          589 |         522 | -67 | -11.38 |        faster
10   | Page Request |   250 |           14 |          13 |   0 |  -1.26 |
10   | Rendering    |   250 |          575 |         508 | -67 | -11.62 |        faster
11   | Overall      |   250 |           66 |          66 |  -1 |  -0.86 |
11   | Page Request |   250 |            1 |           1 |   0 | -16.48 |
11   | Rendering    |   250 |           65 |          65 |   0 |  -0.62 |
12   | Overall      |   250 |          303 |         291 | -12 |  -4.07 |        faster
12   | Page Request |   250 |            2 |           2 |   0 |  12.93 |
12   | Rendering    |   250 |          301 |         289 | -13 |  -4.19 |        faster
13   | Overall      |   250 |           48 |          47 |   0 |  -0.45 |
13   | Page Request |   250 |            1 |           1 |   0 |   1.59 |
13   | Rendering    |   250 |           47 |          46 |   0 |  -0.52 |
```

Here it's clear that this patch *significantly* improves the rendering performance of the slowest pages, while not causing any big regressions elsewhere. As expected, this patch thus helps larger and/or more complex pages the most (which is also where even small improvements will be most beneficial). There's obviously the question of whether this is *slightly* regressing simpler pages, but given just how short the times are in most cases, it's not inconceivable that the page results above are simply caused by e.g. limited `Date.now()` accuracy and/or limited numerical precision.
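To make the general idea easier to follow outside of the diff context, here's a minimal sketch of the pattern; the `SimplifiedChunkedStream` class and the `getByteOld`/`getByteNew` names are illustrative stand-ins, not the actual `ChunkedStream` implementation:

```
// A minimal, self-contained sketch of the inlined-check idea; this is *not*
// the real `ChunkedStream` class, only a simplified stand-in.
class SimplifiedChunkedStream {
  constructor(bytes, progressiveDataLength) {
    this.bytes = bytes; // The data received so far (a Uint8Array).
    this.pos = 0; // The current read position.
    this.end = bytes.length; // The logical end of the stream.
    this.progressiveDataLength = progressiveDataLength;
  }

  ensureByte(pos) {
    // In the real implementation this throws a `MissingDataException` when the
    // chunk containing `pos` hasn't been loaded yet; stubbed out here.
  }

  // Before the patch: every call pays for an `ensureByte` function call.
  getByteOld() {
    const pos = this.pos;
    if (pos >= this.end) {
      return -1;
    }
    this.ensureByte(pos);
    return this.bytes[this.pos++];
  }

  // After the patch: the inverted `progressiveDataLength` check is inlined,
  // so `ensureByte` is only called for data beyond what has been received.
  getByteNew() {
    const pos = this.pos;
    if (pos >= this.end) {
      return -1;
    }
    if (pos >= this.progressiveDataLength) {
      this.ensureByte(pos);
    }
    return this.bytes[this.pos++];
  }
}
```

The `ensureRange` call-sites in the diff below are handled in the same way, with the inverted check instead being e.g. `end > this.progressiveDataLength`.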
---
 src/core/chunked_stream.js | 24 ++++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/src/core/chunked_stream.js b/src/core/chunked_stream.js
index 39c1cb677..a0f29ec74 100644
--- a/src/core/chunked_stream.js
+++ b/src/core/chunked_stream.js
@@ -159,7 +159,9 @@ class ChunkedStream {
     if (pos >= this.end) {
       return -1;
     }
-    this.ensureByte(pos);
+    if (pos >= this.progressiveDataLength) {
+      this.ensureByte(pos);
+    }
     return this.bytes[this.pos++];
   }
 
@@ -187,7 +189,9 @@ class ChunkedStream {
     const strEnd = this.end;
 
     if (!length) {
-      this.ensureRange(pos, strEnd);
+      if (strEnd > this.progressiveDataLength) {
+        this.ensureRange(pos, strEnd);
+      }
       const subarray = bytes.subarray(pos, strEnd);
       // `this.bytes` is always a `Uint8Array` here.
       return (forceClamped ? new Uint8ClampedArray(subarray) : subarray);
@@ -197,7 +201,9 @@ class ChunkedStream {
     if (end > strEnd) {
       end = strEnd;
     }
-    this.ensureRange(pos, end);
+    if (end > this.progressiveDataLength) {
+      this.ensureRange(pos, end);
+    }
 
     this.pos = end;
     const subarray = bytes.subarray(pos, end);
@@ -224,7 +230,9 @@ class ChunkedStream {
     if (end > this.end) {
       end = this.end;
     }
-    this.ensureRange(begin, end);
+    if (end > this.progressiveDataLength) {
+      this.ensureRange(begin, end);
+    }
     return this.bytes.subarray(begin, end);
   }
 
@@ -245,7 +253,9 @@ class ChunkedStream {
 
   makeSubStream(start, length, dict) {
     if (length) {
-      this.ensureRange(start, start + length);
+      if (start + length > this.progressiveDataLength) {
+        this.ensureRange(start, start + length);
+      }
     } else {
       // When the `length` is undefined you do *not*, under any circumstances,
       // want to fallback on calling `this.ensureRange(start, this.end)` since
       // time/resources during e.g. parsing, since `MissingDataException`s will
       // require data to be re-parsed, which we attempt to minimize by at least
       // checking that the *beginning* of the data is available here.
-      this.ensureByte(start);
+      if (start >= this.progressiveDataLength) {
+        this.ensureByte(start);
+      }
     }
 
     function ChunkedStreamSubstream() {}