Re-factor searching for incomplete objects in XRef.indexObjects (issue 15803)
				
					
				
			When trying to find incomplete objects, i.e. those missing the "endobj"-string at the end, there are unfortunately a number of possible operators that we need to check for. Otherwise we could miss e.g. the "trailer" at the end of a corrupt PDF document, which is why the referenced document didn't work. Currently we do all searching on the "raw" bytes of the PDF document, for efficiency; however, this doesn't really work when we need to check for *multiple* potential command-strings. To keep the complexity manageable we'll instead use regular expressions here, but we can at least avoid creating lots of substrings thanks to the `RegExp.lastIndex` property, which is well supported across browsers (see https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/RegExp/lastIndex#browser_compatibility). Note that this repeated regular expression usage could perhaps be slightly less efficient than the old code; however, this method is only invoked for corrupt PDF documents.
This commit is contained in:
		
							parent
							
								
									6a9a567670
								
							
						
					
					
						commit
						2fcf8bb5be
					
				| @ -431,16 +431,14 @@ class XRef { | |||||||
|       } |       } | ||||||
|       return skipped; |       return skipped; | ||||||
|     } |     } | ||||||
|  |     const gEndobjRegExp = /\b(endobj|\d+\s+\d+\s+obj|xref|trailer)\b/g; | ||||||
|  |     const gStartxrefRegExp = /\b(startxref|\d+\s+\d+\s+obj)\b/g; | ||||||
|     const objRegExp = /^(\d+)\s+(\d+)\s+obj\b/; |     const objRegExp = /^(\d+)\s+(\d+)\s+obj\b/; | ||||||
|     const endobjRegExp = /\bendobj[\b\s]$/; |  | ||||||
|     const nestedObjRegExp = /\s+(\d+\s+\d+\s+obj[\b\s<])$/; |  | ||||||
|     const CHECK_CONTENT_LENGTH = 25; |  | ||||||
| 
 | 
 | ||||||
|     const trailerBytes = new Uint8Array([116, 114, 97, 105, 108, 101, 114]); |     const trailerBytes = new Uint8Array([116, 114, 97, 105, 108, 101, 114]); | ||||||
|     const startxrefBytes = new Uint8Array([ |     const startxrefBytes = new Uint8Array([ | ||||||
|       115, 116, 97, 114, 116, 120, 114, 101, 102, |       115, 116, 97, 114, 116, 120, 114, 101, 102, | ||||||
|     ]); |     ]); | ||||||
|     const objBytes = new Uint8Array([111, 98, 106]); |  | ||||||
|     const xrefBytes = new Uint8Array([47, 88, 82, 101, 102]); |     const xrefBytes = new Uint8Array([47, 88, 82, 101, 102]); | ||||||
| 
 | 
 | ||||||
|     // Clear out any existing entries, since they may be bogus.
 |     // Clear out any existing entries, since they may be bogus.
 | ||||||
| @ -450,6 +448,7 @@ class XRef { | |||||||
|     const stream = this.stream; |     const stream = this.stream; | ||||||
|     stream.pos = 0; |     stream.pos = 0; | ||||||
|     const buffer = stream.getBytes(), |     const buffer = stream.getBytes(), | ||||||
|  |       bufferStr = bytesToString(buffer), | ||||||
|       length = buffer.length; |       length = buffer.length; | ||||||
|     let position = stream.start; |     let position = stream.start; | ||||||
|     const trailers = [], |     const trailers = [], | ||||||
| @ -484,8 +483,8 @@ class XRef { | |||||||
|         const num = m[1] | 0, |         const num = m[1] | 0, | ||||||
|           gen = m[2] | 0; |           gen = m[2] | 0; | ||||||
| 
 | 
 | ||||||
|  |         const startPos = position + token.length; | ||||||
|         let contentLength, |         let contentLength, | ||||||
|           startPos = position + token.length, |  | ||||||
|           updateEntries = false; |           updateEntries = false; | ||||||
|         if (!this.entries[num]) { |         if (!this.entries[num]) { | ||||||
|           updateEntries = true; |           updateEntries = true; | ||||||
| @ -519,31 +518,22 @@ class XRef { | |||||||
|         // Find the next "obj" string, rather than "endobj", to ensure that
 |         // Find the next "obj" string, rather than "endobj", to ensure that
 | ||||||
|         // we won't skip over a new 'obj' operator in corrupt files where
 |         // we won't skip over a new 'obj' operator in corrupt files where
 | ||||||
|         // 'endobj' operators are missing (fixes issue9105_reduced.pdf).
 |         // 'endobj' operators are missing (fixes issue9105_reduced.pdf).
 | ||||||
|         while (startPos < length) { |         gEndobjRegExp.lastIndex = startPos; | ||||||
|           const endPos = startPos + skipUntil(buffer, startPos, objBytes) + 4; |         const match = gEndobjRegExp.exec(bufferStr); | ||||||
|  | 
 | ||||||
|  |         if (match) { | ||||||
|  |           const endPos = gEndobjRegExp.lastIndex + 1; | ||||||
|           contentLength = endPos - position; |           contentLength = endPos - position; | ||||||
| 
 | 
 | ||||||
|           const checkPos = Math.max(endPos - CHECK_CONTENT_LENGTH, startPos); |           if (match[1] !== "endobj") { | ||||||
|           const tokenStr = bytesToString(buffer.subarray(checkPos, endPos)); |             warn( | ||||||
| 
 |               `indexObjects: Found "${match[1]}" inside of another "obj", ` + | ||||||
|           // Check if the current object ends with an 'endobj' operator.
 |                 'caused by missing "endobj" -- trying to recover.' | ||||||
|           if (endobjRegExp.test(tokenStr)) { |             ); | ||||||
|             break; |             contentLength -= match[1].length + 1; | ||||||
|           } else { |  | ||||||
|             // Check if an "obj" occurrence is actually a new object,
 |  | ||||||
|             // i.e. the current object is missing the 'endobj' operator.
 |  | ||||||
|             const objToken = nestedObjRegExp.exec(tokenStr); |  | ||||||
| 
 |  | ||||||
|             if (objToken && objToken[1]) { |  | ||||||
|               warn( |  | ||||||
|                 'indexObjects: Found new "obj" inside of another "obj", ' + |  | ||||||
|                   'caused by missing "endobj" -- trying to recover.' |  | ||||||
|               ); |  | ||||||
|               contentLength -= objToken[1].length; |  | ||||||
|               break; |  | ||||||
|             } |  | ||||||
|           } |           } | ||||||
|           startPos = endPos; |         } else { | ||||||
|  |           contentLength = length - position; | ||||||
|         } |         } | ||||||
|         const content = buffer.subarray(position, position + contentLength); |         const content = buffer.subarray(position, position + contentLength); | ||||||
| 
 | 
 | ||||||
| @ -562,26 +552,26 @@ class XRef { | |||||||
|       ) { |       ) { | ||||||
|         trailers.push(position); |         trailers.push(position); | ||||||
| 
 | 
 | ||||||
|         const contentLength = skipUntil(buffer, position, startxrefBytes); |         const startPos = position + token.length; | ||||||
|  |         let contentLength; | ||||||
|         // Attempt to handle (some) corrupt documents, where no 'startxref'
 |         // Attempt to handle (some) corrupt documents, where no 'startxref'
 | ||||||
|         // operators are present (fixes issue15590.pdf).
 |         // operators are present (fixes issue15590.pdf).
 | ||||||
|         if (position + contentLength >= length) { |         gStartxrefRegExp.lastIndex = startPos; | ||||||
|           const endPos = position + skipUntil(buffer, position, objBytes) + 4; |         const match = gStartxrefRegExp.exec(bufferStr); | ||||||
| 
 | 
 | ||||||
|           const checkPos = Math.max(endPos - CHECK_CONTENT_LENGTH, position); |         if (match) { | ||||||
|           const tokenStr = bytesToString(buffer.subarray(checkPos, endPos)); |           const endPos = gStartxrefRegExp.lastIndex + 1; | ||||||
|  |           contentLength = endPos - position; | ||||||
| 
 | 
 | ||||||
|           // Find the first "obj" occurrence after the 'trailer' operator.
 |           if (match[1] !== "startxref") { | ||||||
|           const objToken = nestedObjRegExp.exec(tokenStr); |  | ||||||
| 
 |  | ||||||
|           if (objToken && objToken[1]) { |  | ||||||
|             warn( |             warn( | ||||||
|               'indexObjects: Found first "obj" after "trailer", ' + |               `indexObjects: Found "${match[1]}" after "trailer", ` + | ||||||
|                 'caused by missing "startxref" -- trying to recover.' |                 'caused by missing "startxref" -- trying to recover.' | ||||||
|             ); |             ); | ||||||
|             position = endPos - objToken[1].length; |             contentLength -= match[1].length + 1; | ||||||
|             continue; |  | ||||||
|           } |           } | ||||||
|  |         } else { | ||||||
|  |           contentLength = length - position; | ||||||
|         } |         } | ||||||
|         position += contentLength; |         position += contentLength; | ||||||
|       } else { |       } else { | ||||||
|  | |||||||
							
								
								
									
										1
									
								
								test/pdfs/issue15803.pdf.link
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										1
									
								
								test/pdfs/issue15803.pdf.link
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1 @@ | |||||||
|  | https://github.com/mozilla/pdf.js/files/10200431/ocg.pdf | ||||||
| @ -1761,6 +1761,15 @@ | |||||||
|        "link": false, |        "link": false, | ||||||
|        "type": "eq" |        "type": "eq" | ||||||
|     }, |     }, | ||||||
|  |     {  "id": "issue15803", | ||||||
|  |        "file": "pdfs/issue15803.pdf", | ||||||
|  |        "md5": "e501a4418d4ece5be7ce4e8acf029100", | ||||||
|  |        "rounds": 1, | ||||||
|  |        "link": true, | ||||||
|  |        "lastPage": 1, | ||||||
|  |        "type": "eq", | ||||||
|  |        "annotations": true | ||||||
|  |     }, | ||||||
|     {  "id": "issue9105_other", |     {  "id": "issue9105_other", | ||||||
|        "file": "pdfs/issue9105_other.pdf", |        "file": "pdfs/issue9105_other.pdf", | ||||||
|        "md5": "4c8b9c2cceb9c5d621e1d50b3dc38efc", |        "md5": "4c8b9c2cceb9c5d621e1d50b3dc38efc", | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user