Merge pull request #520 from notmasteryet/invalidpdf-1

Recovering from the invalid PDF
2011-09-26 10:40:19 -07:00 · 2011-09-26 10:40:19 -07:00 · 3745ce9480
commit 3745ce9480
parent 805fa3b291 39ba5324a7
4 changed files with 123 additions and 2 deletions
--- a/pdf.js
+++ b/pdf.js
@@ -3164,6 +3164,113 @@ var XRef = (function xRefXRef() {
        this.readXRef(prev);
      return streamParameters;
    },
+    indexObjects: function indexObjects() {
+      // Simple scan through the PDF content to find objects,
+      // trailers and XRef streams.
+      function readToken(data, offset) {
+        var token = '', ch = data[offset];
+        while (ch !== 13 && ch !== 10) {
+          if (++offset >= data.length)
+            break;
+          token += String.fromCharCode(ch);
+          ch = data[offset];
+        }
+        return token;
+      }
+      function skipUntil(data, offset, what) {
+        var length = what.length, dataLength = data.length;
+        var skipped = 0;
+        // finding byte sequence
+        while (offset < dataLength) {
+          var i = 0;
+          while (i < length && data[offset + i] == what[i])
+            ++i;
+          if (i >= length)
+            break; // sequence found
+
+          offset++;
+          skipped++;
+        }
+        return skipped;
+      }
+      var trailerBytes = new Uint8Array([116, 114, 97, 105, 108, 101, 114]);
+      var startxrefBytes = new Uint8Array([115, 116, 97, 114, 116, 120, 114,
+                                          101, 102]);
+      var endobjBytes = new Uint8Array([101, 110, 100, 111, 98, 106]);
+      var xrefBytes = new Uint8Array([47, 88, 82, 101, 102]);
+
+      var stream = this.stream;
+      stream.pos = 0;
+      var buffer = stream.getBytes();
+      var position = 0, length = buffer.length;
+      var trailers = [], xrefStms = [];
+      var state = 0;
+      var currentToken;
+      while (position < length) {
+        var ch = buffer[position];
+        if (ch === 32 || ch === 9 || ch === 13 || ch === 10) {
+          ++position;
+          continue;
+        }
+        if (ch === 37) { // %-comment
+          do {
+            ++position;
+            ch = buffer[position];
+          } while (ch !== 13 && ch !== 10);
+          continue;
+        }
+        var token = readToken(buffer, position);
+        var m;
+        if (token === 'xref') {
+          position += skipUntil(buffer, position, trailerBytes);
+          trailers.push(position);
+          position += skipUntil(buffer, position, startxrefBytes);
+        } else if ((m = /^(\d+)\s+(\d+)\s+obj\b/.exec(token))) {
+          this.entries[m[1]] = {
+            offset: position,
+            gen: m[2] | 0,
+            uncompressed: true
+          };
+
+          var contentLength = skipUntil(buffer, position, endobjBytes) + 7;
+          var content = buffer.subarray(position, position + contentLength);
+
+          // checking XRef stream suspect
+          // (it shall have '/XRef' and next char is not a letter)
+          var xrefTagOffset = skipUntil(content, 0, xrefBytes);
+          if (xrefTagOffset < contentLength &&
+              content[xrefTagOffset + 5] < 64) {
+            xrefStms.push(position);
+            this.xrefstms[position] = 1; // don't read it recursively
+          }
+
+          position += contentLength;
+        } else
+          position += token.length + 1;
+      }
+      // reading XRef streams
+      for (var i = 0; i < xrefStms.length; ++i) {
+          this.readXRef(xrefStms[i]);
+      }
+      // finding main trailer
+      for (var i = 0; i < trailers.length; ++i) {
+        stream.pos = trailers[i];
+        var parser = new Parser(new Lexer(stream), true);
+        var obj = parser.getObj();
+        if (!IsCmd(obj, 'trailer'))
+          continue;
+        // read the trailer dictionary
+        var dict;
+        if (!IsDict(dict = parser.getObj()))
+          continue;
+        // taking the first one with 'ID'
+        if (dict.has('ID'))
+          return dict;
+      }
+      // nothing helps
+      error('Invalid PDF structure');
+      return null;
+    },
    readXRef: function readXref(startXRef) {
      var stream = this.stream;
      stream.pos = startXRef;
@@ -3181,8 +3288,7 @@ var XRef = (function xRefXRef() {
        }
        return this.readXRefStream(obj);
      }
-      error('Invalid XRef');
-      return null;
+      return this.indexObjects();
    },
    getEntry: function xRefGetEntry(i) {
      var e = this.entries[i];
--- a/test/driver.js
+++ b/test/driver.js
@@ -125,6 +125,13 @@ function nextPage(task, loadError) {
    }
  }

+  if (task.skipPages && task.skipPages.indexOf(task.pageNum) >= 0) {
+    log(' skipping page ' + task.pageNum + '/' + task.pdfDoc.numPages +
+        '... ');
+    snapshotCurrentPage(task, '');
+    return;
+  }
+
  var page = null;

  if (!failure) {
--- a/test/pdfs/ibwa-bad.pdf.link
+++ b/test/pdfs/ibwa-bad.pdf.link
@@ -0,0 +1 @@
+http://www.bottledwater.org/public/pdf/IBWA05ModelCode_Mar2.pdf
--- a/test/test_manifest.json
+++ b/test/test_manifest.json
@@ -157,5 +157,12 @@
       "link": false,
       "rounds": 1,
       "type": "load"
+    },
+    {  "id": "ibwa-bad",
+       "file": "pdfs/ibwa-bad.pdf",
+       "link": true,
+       "rounds": 1,
+       "skipPages": [ 16 ],
+       "type": "load"
    }
 ]
				`@ -0,0 +1 @@`
				`http://www.bottledwater.org/public/pdf/IBWA05ModelCode_Mar2.pdf`