From 0ffe9b9289cc5cddc0e6c4414518285c5ab31bcf Mon Sep 17 00:00:00 2001
From: Rob Wu <rob@robwu.nl>
Date: Fri, 19 Jan 2018 17:28:47 +0100
Subject: [PATCH 1/2] Remove useless test from network_utils_spec.js

Remove "returns null when content disposition is form-data".
The name of the test is already misleading: It suggests that
the return value is null if the Content-Disposition starts with
"form-data". This is not the case: any header with a "filename"
parameter is accepted.

So, to correct this, one would have to rephrase the test description to
"returns null when content disposition has no filename".
But this is already tested by the test called
"gets the filename from the response header".

So, remove the test.
---
 test/unit/network_utils_spec.js | 20 --------------------
 1 file changed, 20 deletions(-)

diff --git a/test/unit/network_utils_spec.js b/test/unit/network_utils_spec.js
index b98ac5d01..d4fa96e8b 100644
--- a/test/unit/network_utils_spec.js
+++ b/test/unit/network_utils_spec.js
@@ -175,26 +175,6 @@ describe('network_utils', function() {
       })).toEqual('filename.pdf');
     });
 
-    it('returns null when content disposition is form-data', function() {
-      expect(extractFilenameFromHeader((headerName) => {
-        if (headerName === 'Content-Disposition') {
-          return 'form-data';
-        }
-      })).toBeNull();
-
-      expect(extractFilenameFromHeader((headerName) => {
-        if (headerName === 'Content-Disposition') {
-          return 'form-data; name="filename.pdf"';
-        }
-      })).toBeNull();
-
-      expect(extractFilenameFromHeader((headerName) => {
-        if (headerName === 'Content-Disposition') {
-          return 'form-data; name="filename.pdf"; filename="file.pdf"';
-        }
-      })).toEqual('file.pdf');
-    });
-
     it('only extracts filename with pdf extension', function () {
       expect(extractFilenameFromHeader((headerName) => {
         if (headerName === 'Content-Disposition') {

From a4e907169efaadec79b75ad92101e0f35312b0b4 Mon Sep 17 00:00:00 2001
From: Rob Wu <rob@robwu.nl>
Date: Fri, 19 Jan 2018 17:39:31 +0100
Subject: [PATCH 2/2] Improve correctness of Content-Disposition parser

Re-uses logic from https://github.com/Rob--W/open-in-browser/blob/9f5fcae11cf6d99c503a15894f22efdfcd2075b7/extension/content-disposition.js
which is already covered by tests: https://github.com/Rob--W/open-in-browser/commit/6f3bbb8bbfc1e3e943200fffdb68d35075e82ddd
---
 .eslintrc                          |   5 +-
 src/display/content_disposition.js | 211 +++++++++++++++++++++++++++++
 src/display/network_utils.js       |  11 +-
 test/unit/network_utils_spec.js    |  47 +++++++
 4 files changed, 268 insertions(+), 6 deletions(-)
 create mode 100644 src/display/content_disposition.js

diff --git a/.eslintrc b/.eslintrc
index 488426187..1dd046d54 100644
--- a/.eslintrc
+++ b/.eslintrc
@@ -120,7 +120,10 @@
     "keyword-spacing": ["error", { "before": true, "after": true, }],
     "linebreak-style": ["error", "unix"],
     "lines-between-class-members": ["error", "always"],
-    "max-len": ["error", 80],
+    "max-len": ["error", {
+      "code": 80,
+      "ignoreUrls": true
+    }],
     "new-cap": ["error", { "newIsCap": true, "capIsNew": false, }],
     "new-parens": "error",
     "no-array-constructor": "error",
diff --git a/src/display/content_disposition.js b/src/display/content_disposition.js
new file mode 100644
index 000000000..4b3608814
--- /dev/null
+++ b/src/display/content_disposition.js
@@ -0,0 +1,211 @@
+/* Copyright 2017 Mozilla Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// This getFilenameFromContentDispositionHeader function is adapted from
+// https://github.com/Rob--W/open-in-browser/blob/9f5fcae11cf6d99c503a15894f22efdfcd2075b7/extension/content-disposition.js
+// with the following changes:
+// - Modified to conform to PDF.js's coding style.
+// - Support UTF-8 decoding when TextDecoder is unsupported.
+// - Replace Array.from with Array + loop for compat with old browsers.
+// - Replace "startsWith" with other string method for compat with old browsers.
+// - Move return to the end of the function to prevent Babel from dropping the
+//   function declarations.
+
+/**
+ * Extract file name from the Content-Disposition HTTP response header.
+ *
+ * @param {string} contentDisposition - Raw Content-Disposition header value.
+ * @return {string} The decoded filename, or '' if the header has none.
+ */
+function getFilenameFromContentDispositionHeader(contentDisposition) {
+  let needsEncodingFixup = true;
+
+  // filename*=ext-value ("ext-value" from RFC 5987, referenced by RFC 6266).
+  let tmp = /(?:^|;)\s*filename\*\s*=\s*([^;\s]+)/i.exec(contentDisposition);
+  if (tmp) {
+    let filename = rfc2616unquote(tmp[1]);
+    filename = unescape(filename);
+    filename = rfc5987decode(filename);
+    filename = rfc2047decode(filename);
+    return fixupEncoding(filename);
+  }
+
+  // Continuations (RFC 2231 section 3, referenced by RFC 5987 section 3.1).
+  // filename*n*=part or filename*n=part
+  tmp = rfc2231getparam(contentDisposition);
+  if (tmp) {
+    // The reassembled value may itself be an RFC 2047 encoded-word.
+    let filename = rfc2047decode(tmp);
+    return fixupEncoding(filename);
+  }
+
+  // filename=value (RFC 5987, section 4.1); names are case-insensitive.
+  tmp = /(?:^|;)\s*filename\s*=\s*([^;\s]+)/i.exec(contentDisposition);
+  if (tmp) {
+    let filename = rfc2616unquote(tmp[1]);
+    filename = rfc2047decode(filename);
+    return fixupEncoding(filename);
+  }
+
+  // After this line there are only function declarations. We cannot put
+  // "return" here for readability because babel would then drop the function
+  // declarations...
+
+  // Decodes |value| (a string with one char per byte) using |encoding|.
+  function textdecode(encoding, value) {
+    if (encoding) {
+      // Only chars in the byte range 0x00-0xFF can be mapped to bytes.
+      if (!/^[\x00-\xFF]+$/.test(value)) {
+        return value;
+      }
+      try {
+        let decoder = new TextDecoder(encoding, { fatal: true, });
+        let bytes = new Array(value.length);
+        for (let i = 0; i < value.length; ++i) {
+          bytes[i] = value.charCodeAt(i);
+        }
+        value = decoder.decode(new Uint8Array(bytes));
+        needsEncodingFixup = false;
+      } catch (e) {
+        // Unknown encoding, undecodable bytes, or TextDecoder is unavailable.
+        if (/^utf-?8$/i.test(encoding)) {
+          // UTF-8 is commonly used, try to support it in another way.
+          try {
+            value = decodeURIComponent(escape(value));
+            needsEncodingFixup = false;
+          } catch (err) {
+            // Malformed UTF-8 (URIError): keep |value| undecoded.
+          }
+        }
+      }
+    }
+    return value;
+  }
+  // If no charset was applied yet, non-ASCII bytes may be raw UTF-8.
+  function fixupEncoding(value) {
+    if (needsEncodingFixup && /[\x80-\xff]/.test(value)) {
+      return textdecode('utf-8', value);
+    }
+    return value;
+  }
+  // Joins the RFC 2231 continuations filename*n= and filename*n*= in order.
+  function rfc2231getparam(contentDisposition) {
+    let matches = [], match;
+    // n is an integer of at least zero; non-zero n must not start with '0'.
+    let iter = /(?:^|;)\s*filename\*((?!0\d)\d+)(\*?)\s*=\s*([^;\s]+)/ig;
+    while ((match = iter.exec(contentDisposition)) !== null) {
+      let [, n, quot, part] = match;
+      n = parseInt(n, 10);
+      if (n in matches) {
+        // Ignore anything after the invalid second filename*0.
+        if (n === 0) {
+          break;
+        }
+        continue;
+      }
+      matches[n] = [quot, part];
+    }
+    let parts = [];
+    for (let n = 0; n < matches.length; ++n) {
+      if (!(n in matches)) {
+        // Numbers must be consecutive. Truncate when there is a hole.
+        break;
+      }
+      let [quot, part] = matches[n];
+      part = rfc2616unquote(part);
+      if (quot) {
+        part = unescape(part);
+        if (n === 0) {
+          part = rfc5987decode(part);
+        }
+      }
+      parts.push(part);
+    }
+    return parts.join('');
+  }
+  // Unquotes an RFC 2616 quoted-string, ending at the first unescaped ".
+  function rfc2616unquote(value) {
+    if (value.charAt(0) === '"') {
+      let parts = value.slice(1).split('\\"');
+      for (let i = 0; i < parts.length; ++i) {
+        let quotindex = parts[i].indexOf('"');
+        if (quotindex !== -1) {
+          parts[i] = parts[i].slice(0, quotindex);
+          parts.length = i + 1; // Truncates and stop the iteration.
+        }
+        parts[i] = parts[i].replace(/\\(.)/g, '$1');
+      }
+      value = parts.join('"');
+    }
+    return value;
+  }
+  function rfc5987decode(extvalue) {
+    // Decodes "ext-value" from RFC 5987.
+    let encodingend = extvalue.indexOf('\'');
+    if (encodingend === -1) {
+      // Some servers send "filename*=" without encoding'language' prefix,
+      // e.g. in https://github.com/Rob--W/open-in-browser/issues/26
+      // Let's accept the value like Firefox (57) (Chrome 62 rejects it).
+      return extvalue;
+    }
+    let encoding = extvalue.slice(0, encodingend);
+    let langvalue = extvalue.slice(encodingend + 1);
+    // Ignore language (RFC 5987 section 3.2.1, and RFC 6266 section 4.1 ).
+    let value = langvalue.replace(/^[^']*'/, '');
+    return textdecode(encoding, value);
+  }
+  function rfc2047decode(value) {
+    // RFC 2047-decode the result. Firefox tried to drop support for it, but
+    // backed out because some servers use it - https://bugzil.la/875615
+    // Firefox's condition for decoding is here: https://searchfox.org/mozilla-central/rev/4a590a5a15e35d88a3b23dd6ac3c471cf85b04a8/netwerk/mime/nsMIMEHeaderParamImpl.cpp#742-748
+    // We are more strict and only recognize RFC 2047-encoding if the value
+    // starts with "=?", since then it is likely that the full value is
+    // RFC 2047-encoded.
+    // Firefox also decodes words even where RFC 2047 section 5 states:
+    // "An 'encoded-word' MUST NOT appear within a 'quoted-string'."
+    if (value.slice(0, 2) !== '=?' || /[\x00-\x19\x80-\xff]/.test(value)) {
+      return value;
+    }
+    // RFC 2047, section 2.4
+    // encoded-word = "=?" charset "?" encoding "?" encoded-text "?="
+    // charset = token (but let's restrict to characters that denote a
+    //       possibly valid encoding).
+    // encoding = q or b
+    // encoded-text = any printable ASCII character other than ? or space.
+    //        ... but Firefox permits ? and space.
+    return value.replace(/=\?([\w\-]*)\?([QqBb])\?((?:[^?]|\?(?!=))*)\?=/g,
+      function(_, charset, encoding, text) {
+        if (encoding === 'q' || encoding === 'Q') {
+          // RFC 2047 section 4.2.
+          text = text.replace(/_/g, ' ');
+          text = text.replace(/=([0-9a-fA-F]{2})/g, function(_, hex) {
+            return String.fromCharCode(parseInt(hex, 16));
+          });
+          return textdecode(charset, text);
+        } // else encoding is b or B - base64 (RFC 2047 section 4.1)
+        try {
+          return textdecode(charset, atob(text));
+        } catch (e) {
+          return text;
+        }
+      });
+  }
+
+  return '';
+}
+
+export {
+  getFilenameFromContentDispositionHeader,
+};
diff --git a/src/display/network_utils.js b/src/display/network_utils.js
index 1b0eb0ee9..be55e4129 100644
--- a/src/display/network_utils.js
+++ b/src/display/network_utils.js
@@ -16,7 +16,9 @@
 import {
   assert, MissingPDFException, UnexpectedResponseException
 } from '../shared/util';
-import { getFilenameFromUrl } from './dom_utils';
+import {
+  getFilenameFromContentDispositionHeader
+} from './content_disposition';
 
 function validateRangeRequestCapabilities({ getResponseHeader, isHttp,
                                             rangeChunkSize, disableRange, }) {
@@ -56,10 +58,9 @@ function validateRangeRequestCapabilities({ getResponseHeader, isHttp,
 function extractFilenameFromHeader(getResponseHeader) {
   const contentDisposition = getResponseHeader('Content-Disposition');
   if (contentDisposition) {
-    let parts =
-      /.+;\s*filename=(?:'|")(.+\.pdf)(?:'|")/gi.exec(contentDisposition);
-    if (parts !== null && parts.length > 1) {
-      return getFilenameFromUrl(parts[1]);
+    let filename = getFilenameFromContentDispositionHeader(contentDisposition);
+    if (/\.pdf$/i.test(filename)) {
+      return filename;
     }
   }
   return null;
diff --git a/test/unit/network_utils_spec.js b/test/unit/network_utils_spec.js
index d4fa96e8b..56de74408 100644
--- a/test/unit/network_utils_spec.js
+++ b/test/unit/network_utils_spec.js
@@ -173,6 +173,53 @@ describe('network_utils', function() {
           return 'attachment; filename="filename.pdf"';
         }
       })).toEqual('filename.pdf');
+
+      expect(extractFilenameFromHeader((headerName) => {
+        if (headerName === 'Content-Disposition') {
+          return 'attachment; filename=filename.pdf';
+        }
+      })).toEqual('filename.pdf');
+    });
+
+    it('gets the filename from the response header (RFC 6266)', function() {
+      expect(extractFilenameFromHeader((headerName) => {
+        if (headerName === 'Content-Disposition') {
+          return 'attachment; filename*=filename.pdf';
+        }
+      })).toEqual('filename.pdf'); // No charset'language' prefix.
+
+      expect(extractFilenameFromHeader((headerName) => {
+        if (headerName === 'Content-Disposition') {
+          return 'attachment; filename*=\'\'filename.pdf';
+        }
+      })).toEqual('filename.pdf'); // Empty charset and language.
+
+      expect(extractFilenameFromHeader((headerName) => {
+        if (headerName === 'Content-Disposition') {
+          return 'attachment; filename*=utf-8\'\'filename.pdf';
+        }
+      })).toEqual('filename.pdf'); // Explicit charset, empty language.
+
+      expect(extractFilenameFromHeader((headerName) => {
+        if (headerName === 'Content-Disposition') {
+          return 'attachment; filename=no.pdf; filename*=utf-8\'\'filename.pdf';
+        }
+      })).toEqual('filename.pdf'); // filename* wins over an earlier filename.
+
+      expect(extractFilenameFromHeader((headerName) => {
+        if (headerName === 'Content-Disposition') {
+          return 'attachment; filename*=utf-8\'\'filename.pdf; filename=no.pdf';
+        }
+      })).toEqual('filename.pdf'); // filename* wins over a later filename.
+    });
+
+    it('gets the filename from the response header (RFC 2231)', function() {
+      // Tests continuations (RFC 2231 section 3, via RFC 5987 section 3.1):
+      // the numbered filename*0 and filename*1 parts form a single value.
+      expect(extractFilenameFromHeader((headerName) => {
+        if (headerName === 'Content-Disposition') {
+          return 'attachment; filename*0=filename; filename*1=.pdf';
+        }
+      })).toEqual('filename.pdf'); // Parts are concatenated in index order.
     });
 
     it('only extracts filename with pdf extension', function () {