When parsing corrupt documents, avoid inserting obviously broken data in the XRef-table (issue 13783)

In cases where even the very *first* attempt at reading from an object will throw, simply ignoring such objects will help improve rendering of *some* corrupt documents. Note that this will lead to more parsing in some cases, but considering that this only applies to *corrupt* documents, that shouldn't be a big deal.
2021-07-23 17:37:55 +02:00 · 2021-07-23 17:37:55 +02:00 · b82c802dff
commit b82c802dff
parent 51f0a81085
5 changed files with 51 additions and 10 deletions
--- a/src/core/core_utils.js
+++ b/src/core/core_utils.js
@@ -58,6 +58,8 @@ class MissingDataException extends BaseException {
  }
 }

+class ParserEOFException extends BaseException {}
+
 class XRefEntryException extends BaseException {}

 class XRefParseException extends BaseException {}
@@ -450,6 +452,7 @@ export {
  isWhiteSpace,
  log2,
  MissingDataException,
+  ParserEOFException,
  parseXFAPath,
  readInt8,
  readUint16,
--- a/src/core/parser.js
+++ b/src/core/parser.js
@@ -33,7 +33,11 @@ import {
  Name,
  Ref,
 } from "./primitives.js";
-import { isWhiteSpace, MissingDataException } from "./core_utils.js";
+import {
+  isWhiteSpace,
+  MissingDataException,
+  ParserEOFException,
+} from "./core_utils.js";
 import { Ascii85Stream } from "./ascii_85_stream.js";
 import { AsciiHexStream } from "./ascii_hex_stream.js";
 import { CCITTFaxStream } from "./ccitt_stream.js";
@@ -124,10 +128,10 @@ class Parser {
            array.push(this.getObj(cipherTransform));
          }
          if (isEOF(this.buf1)) {
-            if (!this.recoveryMode) {
-              throw new FormatError("End of file inside array");
+            if (this.recoveryMode) {
+              return array;
            }
-            return array;
+            throw new ParserEOFException("End of file inside array.");
          }
          this.shift();
          return array;
@@ -148,10 +152,10 @@ class Parser {
            dict.set(key, this.getObj(cipherTransform));
          }
          if (isEOF(this.buf1)) {
-            if (!this.recoveryMode) {
-              throw new FormatError("End of file inside dictionary");
+            if (this.recoveryMode) {
+              return dict;
            }
-            return dict;
+            throw new ParserEOFException("End of file inside dictionary.");
          }

          // Stream objects are not allowed inside content streams or
--- a/src/core/xref.js
+++ b/src/core/xref.js
@@ -33,6 +33,7 @@ import {
 import { Lexer, Parser } from "./parser.js";
 import {
  MissingDataException,
+  ParserEOFException,
  XRefEntryException,
  XRefParseException,
 } from "./core_utils.js";
@@ -453,15 +454,38 @@ class XRef {
      } else if ((m = objRegExp.exec(token))) {
        const num = m[1] | 0,
          gen = m[2] | 0;
-        if (!this.entries[num] || this.entries[num].gen === gen) {
+
+        let contentLength,
+          startPos = position + token.length,
+          updateEntries = false;
+        if (!this.entries[num]) {
+          updateEntries = true;
+        } else if (this.entries[num].gen === gen) {
+          // Before overwriting an existing entry, ensure that the new one won't
+          // cause *immediate* errors when it's accessed (fixes issue13783.pdf).
+          try {
+            const parser = new Parser({
+              lexer: new Lexer(stream.makeSubStream(startPos)),
+            });
+            parser.getObj();
+            updateEntries = true;
+          } catch (ex) {
+            if (ex instanceof ParserEOFException) {
+              warn(`indexObjects -- checking object (${token}): "${ex}".`);
+            } else {
+              // The error may come from the `Parser`-instance being initialized
+              // without an `XRef`-instance (we don't have a usable one yet).
+              updateEntries = true;
+            }
+          }
+        }
+        if (updateEntries) {
          this.entries[num] = {
            offset: position - stream.start,
            gen,
            uncompressed: true,
          };
        }
-        let contentLength,
-          startPos = position + token.length;

        // Find the next "obj" string, rather than "endobj", to ensure that
        // we won't skip over a new 'obj' operator in corrupt files where
--- a/test/pdfs/issue13783.pdf.link
+++ b/test/pdfs/issue13783.pdf.link
@@ -0,0 +1 @@
+https://github.com/mozilla/pdf.js/files/6869824/TimeTravel.pdf
--- a/test/test_manifest.json
+++ b/test/test_manifest.json
@@ -1382,6 +1382,15 @@
       "enableXfa": true,
       "type": "eq"
    },
+    {  "id": "issue13783",
+       "file": "pdfs/issue13783.pdf",
+       "md5": "6958d827afa566efbd82f53271ea5cd6",
+       "link": true,
+       "rounds": 1,
+       "firstPage": 7,
+       "lastPage": 7,
+       "type": "eq"
+    },
    {  "id": "issue9262",
       "file": "pdfs/issue9262_reduced.pdf",
       "md5": "5347ce2d7b3866625c22e115fd90e0de",
				`@ -0,0 +1 @@`
				`https://github.com/mozilla/pdf.js/files/6869824/TimeTravel.pdf`