Merge pull request #1148 from arturadib/readXRefTable

Rewrite of readXRefTable [obj.js]
2012-02-01 04:40:22 -08:00 · 2012-02-01 04:40:22 -08:00 · ff01faa86f
commit ff01faa86f
parent 13f207424c 775290d698
1 changed files with 86 additions and 69 deletions
--- a/src/obj.js
+++ b/src/obj.js
@ -287,74 +287,69 @@ var XRef = (function XRefClosure() {

  XRef.prototype = {
    readXRefTable: function readXRefTable(parser) {
+      // Example of cross-reference table:
+      // xref
+      // 0 1                    <-- subsection header (first obj #, obj count)
+      // 0000000000 65535 f     <-- actual object (offset, generation #, f/n)
+      // 23 2                   <-- subsection header ... and so on ...
+      // 0000025518 00002 n
+      // 0000025635 00000 n
+      // trailer
+      // ...
+
+      // Outer loop is over subsection headers
      var obj;
-      while (true) {
-        if (isCmd(obj = parser.getObj(), 'trailer'))
-          break;
-        if (!isInt(obj))
-          error('Invalid XRef table');
-        var first = obj;
-        if (!isInt(obj = parser.getObj()))
-          error('Invalid XRef table');
-        var n = obj;
-        if (first < 0 || n < 0 || (first + n) != ((first + n) | 0))
-          error('Invalid XRef table: ' + first + ', ' + n);
-        for (var i = first; i < first + n; ++i) {
+      while (!isCmd(obj = parser.getObj(), 'trailer')) {
+        var first = obj,
+            count = parser.getObj();
+
+        if (!isInt(first) || !isInt(count))
+          error('Invalid XRef table: wrong types in subsection header');
+
+        // Inner loop is over objects themselves
+        for (var i = 0; i < count; i++) {
          var entry = {};
-          if (!isInt(obj = parser.getObj()))
-            error('Invalid XRef table: ' + first + ', ' + n);
-          entry.offset = obj;
-          if (!isInt(obj = parser.getObj()))
-            error('Invalid XRef table: ' + first + ', ' + n);
-          entry.gen = obj;
-          obj = parser.getObj();
-          if (isCmd(obj, 'n')) {
-            entry.uncompressed = true;
-          } else if (isCmd(obj, 'f')) {
+          entry.offset = parser.getObj();
+          entry.gen = parser.getObj();
+          var type = parser.getObj();
+
+          if (isCmd(type, 'f'))
            entry.free = true;
-          } else {
-            error('Invalid XRef table: ' + first + ', ' + n);
-          }
-          if (!this.entries[i]) {
-            // In some buggy PDF files the xref table claims to start at 1
-            // instead of 0.
-            if (i == 1 && first == 1 &&
-                entry.offset == 0 && entry.gen == 65535 && entry.free) {
-              i = first = 0;
-            }
-            this.entries[i] = entry;
+          else if (isCmd(type, 'n'))
+            entry.uncompressed = true;
+
+          // Validate entry obj
+          if (!isInt(entry.offset) || !isInt(entry.gen) ||
+              !(entry.free || entry.uncompressed)) {
+            error('Invalid entry in XRef subsection: ' + first + ', ' + count);
          }
+
+          if (!this.entries[i + first])
+            this.entries[i + first] = entry;
        }
      }

-      // read the trailer dictionary
-      var dict;
-      if (!isDict(dict = parser.getObj()))
-        error('Invalid XRef table');
+      // Sanity check: as per spec, first object must have these properties
+      if (this.entries[0] &&
+          !(this.entries[0].gen === 65535 && this.entries[0].free))
+        error('Invalid XRef table: unexpected first object');

-      // get the 'Prev' pointer
-      var prev;
-      obj = dict.get('Prev');
-      if (isInt(obj)) {
-        prev = obj;
-      } else if (isRef(obj)) {
-        // certain buggy PDF generators generate "/Prev NNN 0 R" instead
-        // of "/Prev NNN"
-        prev = obj.num;
-      }
-      if (prev) {
-        this.readXRef(prev);
-      }
+      // Sanity check
+      if (!isCmd(obj, 'trailer'))
+        error('Invalid XRef table: could not find trailer dictionary');

-      // check for 'XRefStm' key
-      if (isInt(obj = dict.get('XRefStm'))) {
-        var pos = obj;
-        // ignore previously loaded xref streams (possible infinite recursion)
-        if (!(pos in this.xrefstms)) {
-          this.xrefstms[pos] = 1;
-          this.readXRef(pos);
-        }
-      }
+      // Read trailer dictionary, e.g.
+      // trailer
+      //    << /Size 22
+      //      /Root 20R
+      //      /Info 10R
+      //      /ID [ <81b14aafa313db63dbd6f981e49f94f4> ]
+      //    >>
+      // The parser goes through the entire stream << ... >> and provides
+      // a getter interface for the key-value table
+      var dict = parser.getObj();
+      if (!isDict(dict))
+        error('Invalid XRef table: could not parse trailer dictionary');

      return dict;
    },
@ -407,9 +402,6 @@ var XRef = (function XRefClosure() {
        }
        range.splice(0, 2);
      }
-      var prev = streamParameters.get('Prev');
-      if (isInt(prev))
-        this.readXRef(prev);
      return streamParameters;
    },
    indexObjects: function indexObjects() {
@ -529,22 +521,47 @@ var XRef = (function XRefClosure() {
      try {
        var parser = new Parser(new Lexer(stream), true);
        var obj = parser.getObj();
+        var dict;

-        // parse an old-style xref table
-        if (isCmd(obj, 'xref'))
-          return this.readXRefTable(parser);
+        // Get dictionary
+        if (isCmd(obj, 'xref')) {
+          // Parse end-of-file XRef
+          dict = this.readXRefTable(parser);

-        // parse an xref stream
-        if (isInt(obj)) {
+          // Recursively get other XRefs 'XRefStm', if any
+          obj = dict.get('XRefStm');
+          if (isInt(obj)) {
+            var pos = obj;
+            // ignore previously loaded xref streams
+            // (possible infinite recursion)
+            if (!(pos in this.xrefstms)) {
+              this.xrefstms[pos] = 1;
+              this.readXRef(pos);
+            }
+          }
+        } else if (isInt(obj)) {
+          // Parse in-stream XRef
          if (!isInt(parser.getObj()) ||
              !isCmd(parser.getObj(), 'obj') ||
              !isStream(obj = parser.getObj())) {
            error('Invalid XRef stream');
          }
-          return this.readXRefStream(obj);
+          dict = this.readXRefStream(obj);
        }
+
+        // Recursively get previous dictionary, if any
+        obj = dict.get('Prev');
+        if (isInt(obj))
+          this.readXRef(obj);
+        else if (isRef(obj)) {
+          // The spec says Prev must not be a reference, i.e. "/Prev NNN"
+          // This is a fallback for non-compliant PDFs, i.e. "/Prev NNN 0 R"
+          this.readXRef(obj.num);
+        }
+
+        return dict;
      } catch (e) {
-        log('Reading of the xref table/stream failed: ' + e);
+        log('(while reading XRef): ' + e);
      }

      warn('Indexing all PDF objects');