From b82c802dff286c593a0f1c40a3a5139ef57e5bb9 Mon Sep 17 00:00:00 2001 From: Jonas Jenwald Date: Fri, 23 Jul 2021 17:37:55 +0200 Subject: [PATCH] When parsing corrupt documents, avoid inserting obviously broken data in the XRef-table (issue 13783) In cases where even the very *first* attempt at reading from an object will throw, simply ignoring such objects will help improve rendering of *some* corrupt documents. Note that this will lead to more parsing in some cases, but considering that this only applies to *corrupt* documents that shouldn't be a big deal. --- src/core/core_utils.js | 3 +++ src/core/parser.js | 18 +++++++++++------- src/core/xref.js | 30 +++++++++++++++++++++++++++--- test/pdfs/issue13783.pdf.link | 1 + test/test_manifest.json | 9 +++++++++ 5 files changed, 51 insertions(+), 10 deletions(-) create mode 100644 test/pdfs/issue13783.pdf.link diff --git a/src/core/core_utils.js b/src/core/core_utils.js index d074661c7..253a349bc 100644 --- a/src/core/core_utils.js +++ b/src/core/core_utils.js @@ -58,6 +58,8 @@ class MissingDataException extends BaseException { } } +class ParserEOFException extends BaseException {} + class XRefEntryException extends BaseException {} class XRefParseException extends BaseException {} @@ -450,6 +452,7 @@ export { isWhiteSpace, log2, MissingDataException, + ParserEOFException, parseXFAPath, readInt8, readUint16, diff --git a/src/core/parser.js b/src/core/parser.js index 5671d1546..85cb1faab 100644 --- a/src/core/parser.js +++ b/src/core/parser.js @@ -33,7 +33,11 @@ import { Name, Ref, } from "./primitives.js"; -import { isWhiteSpace, MissingDataException } from "./core_utils.js"; +import { + isWhiteSpace, + MissingDataException, + ParserEOFException, +} from "./core_utils.js"; import { Ascii85Stream } from "./ascii_85_stream.js"; import { AsciiHexStream } from "./ascii_hex_stream.js"; import { CCITTFaxStream } from "./ccitt_stream.js"; @@ -124,10 +128,10 @@ class Parser { array.push(this.getObj(cipherTransform)); } if (isEOF(this.buf1)) { - if (!this.recoveryMode) { - throw new FormatError("End of file inside array"); + if (this.recoveryMode) { + return array; } - return array; + throw new ParserEOFException("End of file inside array."); } this.shift(); return array; @@ -148,10 +152,10 @@ class Parser { dict.set(key, this.getObj(cipherTransform)); } if (isEOF(this.buf1)) { - if (!this.recoveryMode) { - throw new FormatError("End of file inside dictionary"); + if (this.recoveryMode) { + return dict; } - return dict; + throw new ParserEOFException("End of file inside dictionary."); } // Stream objects are not allowed inside content streams or diff --git a/src/core/xref.js b/src/core/xref.js index 33c9bc3e2..401fb0024 100644 --- a/src/core/xref.js +++ b/src/core/xref.js @@ -33,6 +33,7 @@ import { import { Lexer, Parser } from "./parser.js"; import { MissingDataException, + ParserEOFException, XRefEntryException, XRefParseException, } from "./core_utils.js"; @@ -453,15 +454,38 @@ class XRef { } else if ((m = objRegExp.exec(token))) { const num = m[1] | 0, gen = m[2] | 0; - if (!this.entries[num] || this.entries[num].gen === gen) { + + let contentLength, + startPos = position + token.length, + updateEntries = false; + if (!this.entries[num]) { + updateEntries = true; + } else if (this.entries[num].gen === gen) { + // Before overwriting an existing entry, ensure that the new one won't + // cause *immediate* errors when it's accessed (fixes issue13783.pdf). + try { + const parser = new Parser({ + lexer: new Lexer(stream.makeSubStream(startPos)), + }); + parser.getObj(); + updateEntries = true; + } catch (ex) { + if (ex instanceof ParserEOFException) { + warn(`indexObjects -- checking object (${token}): "${ex}".`); + } else { + // The error may come from the `Parser`-instance being initialized + // without an `XRef`-instance (we don't have a usable one yet). + updateEntries = true; + } + } + } + if (updateEntries) { this.entries[num] = { offset: position - stream.start, gen, uncompressed: true, }; } - let contentLength, - startPos = position + token.length; // Find the next "obj" string, rather than "endobj", to ensure that // we won't skip over a new 'obj' operator in corrupt files where diff --git a/test/pdfs/issue13783.pdf.link b/test/pdfs/issue13783.pdf.link new file mode 100644 index 000000000..f6fc07445 --- /dev/null +++ b/test/pdfs/issue13783.pdf.link @@ -0,0 +1 @@ +https://github.com/mozilla/pdf.js/files/6869824/TimeTravel.pdf diff --git a/test/test_manifest.json b/test/test_manifest.json index afdba3c79..e28e30361 100644 --- a/test/test_manifest.json +++ b/test/test_manifest.json @@ -1382,6 +1382,15 @@ "enableXfa": true, "type": "eq" }, + { "id": "issue13783", + "file": "pdfs/issue13783.pdf", + "md5": "6958d827afa566efbd82f53271ea5cd6", + "link": true, + "rounds": 1, + "firstPage": 7, + "lastPage": 7, + "type": "eq" + }, { "id": "issue9262", "file": "pdfs/issue9262_reduced.pdf", "md5": "5347ce2d7b3866625c22e115fd90e0de",