From 2d8d8b5e538544ec3509f72ec6b91323f73cc5d5 Mon Sep 17 00:00:00 2001
From: Jonas Jenwald <jonas.jenwald@gmail.com>
Date: Thu, 3 Nov 2016 19:48:08 +0100
Subject: [PATCH 1/2] Use `stringToPDFString` to sanitize bad "Prefix"
 entries in Page Label dictionaries

It seems that certain bad PDF generators can create badly encoded "Prefix" entries for Page Labels, one example being http://ukjewishfilm.org/wp-content/uploads/2015/09/Jewish-Film-Festival-Programme-ONLINE.pdf.

Unfortunately, I didn't come across such a PDF file while adding the API support for Page Labels; I only found this issue now that Page Labels are being used in the viewer. With this patch, we now display the Page Labels in the same way as Adobe Reader.
---
 src/core/obj.js              |   5 +++--
 test/pdfs/.gitignore         |   1 +
 test/pdfs/bad-PageLabels.pdf | Bin 0 -> 792 bytes
 test/unit/api_spec.js        |  12 +++++++++++-
 4 files changed, 15 insertions(+), 3 deletions(-)
 create mode 100644 test/pdfs/bad-PageLabels.pdf

diff --git a/src/core/obj.js b/src/core/obj.js
index 4d3c18735..7196797ba 100644
--- a/src/core/obj.js
+++ b/src/core/obj.js
@@ -302,8 +302,9 @@ var Catalog = (function CatalogClosure() {
           assert(!s || isName(s), 'Invalid style in PageLabel dictionary.');
           style = (s ? s.name : null);
 
-          prefix = labelDict.get('P') || '';
-          assert(isString(prefix), 'Invalid prefix in PageLabel dictionary.');
+          var p = labelDict.get('P') || '';
+          assert(isString(p), 'Invalid prefix in PageLabel dictionary.');
+          prefix = stringToPDFString(p);
 
           start = labelDict.get('St') || 1;
           assert(isInt(start), 'Invalid start in PageLabel dictionary.');
diff --git a/test/pdfs/.gitignore b/test/pdfs/.gitignore
index f07e5921f..ecfb1700d 100644
--- a/test/pdfs/.gitignore
+++ b/test/pdfs/.gitignore
@@ -40,6 +40,7 @@
 !issue7544.pdf
 !issue7598.pdf
 !issue7665.pdf
+!bad-PageLabels.pdf
 !filled-background.pdf
 !ArabicCIDTrueType.pdf
 !ThuluthFeatures.pdf
diff --git a/test/pdfs/bad-PageLabels.pdf b/test/pdfs/bad-PageLabels.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..8cafbcd102f135c177b7da52818b56bc58c30283
GIT binary patch
literal 792
zcmZWn!A{#i5bb%sVlI*3K-QbYNstf{A;|^WM!`~%5Qj~?35()g<h51WpXAU#X#WG4
zSvQLnEFbKr_h#Okna*N5Kat%LbpHN(`2!LYbgM^%vojD2wbdSb1TCANYqiolkNpP8
zem!Xvm#R`bw}ta_&^8ZRR>UG#Cii$ZV97m*OIO<pC5T2GI<%$Ci_(QEHL?F2_^NZG
zChi&U5+e1I1k;2nF12@cxhA8*^W2f5`I4A&S1wO4wYC+N^EeD>%R$=!OQL~^@;Ve0
znJKhCS-L`55I6Oz3fVaXWtrKv%Z=Tl_+jjY^=1$3V{(ml6Lt`rTx+|n9&ng4!&jwN
z1xzxKoDs7rh`B`B!)yabv@6Wrx-NJuKMko?Dn}k7Hmm6#WA+HM40sn!EsSg7wZ*RA
z>Vnpi;E0ED-!{icq%H91_3g{;F@;7kZPA`fy@3SBki>@~r2P~(7%@rH<Ol5sHl`IH
zV1rXSy8}!P$nF8w8__8oVExf&UfQP%+4HJWnVl{n+{Ds3x;~Cffy{4B{=?8!S%pC9
Ma15Q!?E4(v0W?3&djJ3c

literal 0
HcmV?d00001

diff --git a/test/unit/api_spec.js b/test/unit/api_spec.js
index 663193b11..2e50bd7e7 100644
--- a/test/unit/api_spec.js
+++ b/test/unit/api_spec.js
@@ -503,14 +503,24 @@ describe('api', function() {
         return pdfDoc.getPageLabels();
       });
 
-      Promise.all([promise0, promise1, promise2]).then(function (pageLabels) {
+      // PageLabels with bad "Prefix" entries.
+      var url3 = new URL('../pdfs/bad-PageLabels.pdf', window.location).href;
+      var loadingTask3 = new PDFJS.getDocument(url3);
+      var promise3 = loadingTask3.promise.then(function (pdfDoc) {
+        return pdfDoc.getPageLabels();
+      });
+
+      Promise.all([promise0, promise1, promise2, promise3]).then(
+          function (pageLabels) {
         expect(pageLabels[0]).toEqual(['i', 'ii', 'iii', '1']);
         expect(pageLabels[1]).toEqual(['Front Page1']);
         expect(pageLabels[2]).toEqual(['1', '2']);
+        expect(pageLabels[3]).toEqual(['X1']);
 
         loadingTask0.destroy();
         loadingTask1.destroy();
         loadingTask2.destroy();
+        loadingTask3.destroy();
         done();
       }).catch(function (reason) {
         done.fail(reason);

From 0844a72b4dd84852f4b9cf9d341610b015161195 Mon Sep 17 00:00:00 2001
From: Jonas Jenwald <jonas.jenwald@gmail.com>
Date: Thu, 3 Nov 2016 20:08:06 +0100
Subject: [PATCH 2/2] Add a bit more validation to `Catalog_readPageLabels`, to
 ensure that the Page Labels are well formed

---
 src/core/obj.js | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/core/obj.js b/src/core/obj.js
index 7196797ba..19de8dffa 100644
--- a/src/core/obj.js
+++ b/src/core/obj.js
@@ -283,7 +283,6 @@ var Catalog = (function CatalogClosure() {
       var pageLabels = new Array(this.numPages);
       var style = null;
       var prefix = '';
-      var start = 1;
 
       var numberTree = new NumberTree(obj, this.xref);
       var nums = numberTree.getAll();
@@ -300,15 +299,16 @@ var Catalog = (function CatalogClosure() {
 
           var s = labelDict.get('S');
           assert(!s || isName(s), 'Invalid style in PageLabel dictionary.');
-          style = (s ? s.name : null);
+          style = s ? s.name : null;
 
-          var p = labelDict.get('P') || '';
-          assert(isString(p), 'Invalid prefix in PageLabel dictionary.');
-          prefix = stringToPDFString(p);
+          var p = labelDict.get('P');
+          assert(!p || isString(p), 'Invalid prefix in PageLabel dictionary.');
+          prefix = p ? stringToPDFString(p) : '';
 
-          start = labelDict.get('St') || 1;
-          assert(isInt(start), 'Invalid start in PageLabel dictionary.');
-          currentIndex = start;
+          var st = labelDict.get('St');
+          assert(!st || (isInt(st) && st >= 1),
+                 'Invalid start in PageLabel dictionary.');
+          currentIndex = st || 1;
         }
 
         switch (style) {