diff --git a/src/core/xref.js b/src/core/xref.js index ace419657..a15cf4d09 100644 --- a/src/core/xref.js +++ b/src/core/xref.js @@ -431,16 +431,14 @@ class XRef { } return skipped; } + const gEndobjRegExp = /\b(endobj|\d+\s+\d+\s+obj|xref|trailer)\b/g; + const gStartxrefRegExp = /\b(startxref|\d+\s+\d+\s+obj)\b/g; const objRegExp = /^(\d+)\s+(\d+)\s+obj\b/; - const endobjRegExp = /\bendobj[\b\s]$/; - const nestedObjRegExp = /\s+(\d+\s+\d+\s+obj[\b\s<])$/; - const CHECK_CONTENT_LENGTH = 25; const trailerBytes = new Uint8Array([116, 114, 97, 105, 108, 101, 114]); const startxrefBytes = new Uint8Array([ 115, 116, 97, 114, 116, 120, 114, 101, 102, ]); - const objBytes = new Uint8Array([111, 98, 106]); const xrefBytes = new Uint8Array([47, 88, 82, 101, 102]); // Clear out any existing entries, since they may be bogus. @@ -450,6 +448,7 @@ class XRef { const stream = this.stream; stream.pos = 0; const buffer = stream.getBytes(), + bufferStr = bytesToString(buffer), length = buffer.length; let position = stream.start; const trailers = [], @@ -484,8 +483,8 @@ class XRef { const num = m[1] | 0, gen = m[2] | 0; + const startPos = position + token.length; let contentLength, - startPos = position + token.length, updateEntries = false; if (!this.entries[num]) { updateEntries = true; @@ -519,31 +518,22 @@ class XRef { // Find the next "obj" string, rather than "endobj", to ensure that // we won't skip over a new 'obj' operator in corrupt files where // 'endobj' operators are missing (fixes issue9105_reduced.pdf). - while (startPos < length) { - const endPos = startPos + skipUntil(buffer, startPos, objBytes) + 4; + gEndobjRegExp.lastIndex = startPos; + const match = gEndobjRegExp.exec(bufferStr); + + if (match) { + const endPos = gEndobjRegExp.lastIndex + 1; contentLength = endPos - position; - const checkPos = Math.max(endPos - CHECK_CONTENT_LENGTH, startPos); - const tokenStr = bytesToString(buffer.subarray(checkPos, endPos)); - - // Check if the current object ends with an 'endobj' operator. - if (endobjRegExp.test(tokenStr)) { - break; - } else { - // Check if an "obj" occurrence is actually a new object, - // i.e. the current object is missing the 'endobj' operator. - const objToken = nestedObjRegExp.exec(tokenStr); - - if (objToken && objToken[1]) { - warn( - 'indexObjects: Found new "obj" inside of another "obj", ' + - 'caused by missing "endobj" -- trying to recover.' - ); - contentLength -= objToken[1].length; - break; - } + if (match[1] !== "endobj") { + warn( + `indexObjects: Found "${match[1]}" inside of another "obj", ` + + 'caused by missing "endobj" -- trying to recover.' + ); + contentLength -= match[1].length + 1; } - startPos = endPos; + } else { + contentLength = length - position; } const content = buffer.subarray(position, position + contentLength); @@ -562,26 +552,26 @@ class XRef { ) { trailers.push(position); - const contentLength = skipUntil(buffer, position, startxrefBytes); + const startPos = position + token.length; + let contentLength; // Attempt to handle (some) corrupt documents, where no 'startxref' // operators are present (fixes issue15590.pdf). - if (position + contentLength >= length) { - const endPos = position + skipUntil(buffer, position, objBytes) + 4; + gStartxrefRegExp.lastIndex = startPos; + const match = gStartxrefRegExp.exec(bufferStr); - const checkPos = Math.max(endPos - CHECK_CONTENT_LENGTH, position); - const tokenStr = bytesToString(buffer.subarray(checkPos, endPos)); + if (match) { + const endPos = gStartxrefRegExp.lastIndex + 1; + contentLength = endPos - position; - // Find the first "obj" occurrence after the 'trailer' operator. - const objToken = nestedObjRegExp.exec(tokenStr); - - if (objToken && objToken[1]) { + if (match[1] !== "startxref") { warn( - 'indexObjects: Found first "obj" after "trailer", ' + + `indexObjects: Found "${match[1]}" after "trailer", ` + 'caused by missing "startxref" -- trying to recover.' ); - position = endPos - objToken[1].length; - continue; + contentLength -= match[1].length + 1; } + } else { + contentLength = length - position; } position += contentLength; } else { diff --git a/test/pdfs/issue15803.pdf.link b/test/pdfs/issue15803.pdf.link new file mode 100644 index 000000000..7b1e0c0d7 --- /dev/null +++ b/test/pdfs/issue15803.pdf.link @@ -0,0 +1 @@ +https://github.com/mozilla/pdf.js/files/10200431/ocg.pdf diff --git a/test/test_manifest.json b/test/test_manifest.json index 3bbe21a2d..70f277edd 100644 --- a/test/test_manifest.json +++ b/test/test_manifest.json @@ -1761,6 +1761,15 @@ "link": false, "type": "eq" }, + { "id": "issue15803", + "file": "pdfs/issue15803.pdf", + "md5": "e501a4418d4ece5be7ce4e8acf029100", + "rounds": 1, + "link": true, + "lastPage": 1, + "type": "eq", + "annotations": true + }, { "id": "issue9105_other", "file": "pdfs/issue9105_other.pdf", "md5": "4c8b9c2cceb9c5d621e1d50b3dc38efc",