From a4e907169efaadec79b75ad92101e0f35312b0b4 Mon Sep 17 00:00:00 2001
From: Rob Wu <rob@robwu.nl>
Date: Fri, 19 Jan 2018 17:39:31 +0100
Subject: [PATCH] Improve correctness of Content-Disposition parser

Re-uses logic from https://github.com/Rob--W/open-in-browser/blob/9f5fcae11cf6d99c503a15894f22efdfcd2075b7/extension/content-disposition.js
which is already covered by tests: https://github.com/Rob--W/open-in-browser/commit/6f3bbb8bbfc1e3e943200fffdb68d35075e82ddd
---
 .eslintrc                          |   5 +-
 src/display/content_disposition.js | 211 +++++++++++++++++++++++++++++
 src/display/network_utils.js       |  11 +-
 test/unit/network_utils_spec.js    |  47 +++++++
 4 files changed, 268 insertions(+), 6 deletions(-)
 create mode 100644 src/display/content_disposition.js

diff --git a/.eslintrc b/.eslintrc
index 488426187..1dd046d54 100644
--- a/.eslintrc
+++ b/.eslintrc
@@ -120,7 +120,10 @@
     "keyword-spacing": ["error", { "before": true, "after": true, }],
     "linebreak-style": ["error", "unix"],
     "lines-between-class-members": ["error", "always"],
-    "max-len": ["error", 80],
+    "max-len": ["error", {
+      "code": 80,
+      "ignoreUrls": true
+    }],
     "new-cap": ["error", { "newIsCap": true, "capIsNew": false, }],
     "new-parens": "error",
     "no-array-constructor": "error",
diff --git a/src/display/content_disposition.js b/src/display/content_disposition.js
new file mode 100644
index 000000000..4b3608814
--- /dev/null
+++ b/src/display/content_disposition.js
@@ -0,0 +1,211 @@
+/* Copyright 2017 Mozilla Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// This getFilenameFromContentDispositionHeader function is adapted from
+// https://github.com/Rob--W/open-in-browser/blob/9f5fcae11cf6d99c503a15894f22efdfcd2075b7/extension/content-disposition.js
+// with the following changes:
+// - Modified to conform to PDF.js's coding style.
+// - Support UTF-8 decoding when TextDecoder is unsupported.
+// - Replace Array.from with Array + loop for compat with old browsers.
+// - Replace "startsWith" with other string method for compat with old browsers.
+// - Move return to the end of the function to prevent Babel from dropping the
+//   function declarations.
+
+/**
+ * Extract the file name from a Content-Disposition HTTP response header.
+ * Tried in order: filename*= (RFC 5987), filename*N= continuations (RFC 2231),
+ * then plain filename=. Each raw value ends at the first ";" or whitespace.
+ *
+ * @param {string} contentDisposition - The Content-Disposition header value.
+ * @return {string} The decoded file name, or '' when no filename was found.
+ */
+function getFilenameFromContentDispositionHeader(contentDisposition) {
+  // Cleared by textdecode() once a charset was applied successfully.
+  let needsEncodingFixup = true;
+
+  // filename*=ext-value ("ext-value" from RFC 5987, referenced by RFC 6266).
+  let tmp = /(?:^|;)\s*filename\*\s*=\s*([^;\s]+)/i.exec(contentDisposition);
+  if (tmp) {
+    let filename = rfc5987decode(unescape(rfc2616unquote(tmp[1])));
+    return fixupEncoding(rfc2047decode(filename));
+  }
+
+  // Continuations (RFC 2231 section 3, referenced by RFC 5987 section 3.1).
+  // filename*n*=part or filename*n=part
+  tmp = rfc2231getparam(contentDisposition);
+  if (tmp) {
+    // The joined parts may still be RFC 2047-encoded.
+    return fixupEncoding(rfc2047decode(tmp));
+  }
+
+  // filename=value (RFC 5987, section 4.1), matched case-insensitively too.
+  tmp = /(?:^|;)\s*filename\s*=\s*([^;\s]+)/i.exec(contentDisposition);
+  if (tmp) {
+    return fixupEncoding(rfc2047decode(rfc2616unquote(tmp[1])));
+  }
+
+  // Only function declarations follow. The final "return" has to stay at the
+  // end of the function, otherwise Babel would drop these declarations.
+
+  // Maps each char code of |value| to one byte and decodes the bytes using
+  // |encoding|. Returns |value| unchanged when that is not possible.
+  function textdecode(encoding, value) {
+    if (encoding) {
+      // Only char codes 0x00-0xFF can be mapped onto single bytes.
+      if (!/^[\x00-\xFF]+$/.test(value)) {
+        return value;
+      }
+      try {
+        let decoder = new TextDecoder(encoding, { fatal: true, });
+        let bytes = new Array(value.length);
+        for (let i = 0; i < value.length; ++i) {
+          bytes[i] = value.charCodeAt(i);
+        }
+        value = decoder.decode(new Uint8Array(bytes));
+        needsEncodingFixup = false;
+      } catch (e) {
+        // TextDecoder is missing, or it rejected the encoding or the bytes.
+        if (/^utf-?8$/i.test(encoding)) {
+          // UTF-8 is commonly used, try to support it in another way:
+          try {
+            value = decodeURIComponent(escape(value));
+            needsEncodingFixup = false;
+          } catch (err) {
+            // Malformed UTF-8: keep the value as-is instead of throwing.
+          }
+        }
+      }
+    }
+    return value;
+  }
+  // Last resort: tries UTF-8 if no charset was applied to |value| before.
+  function fixupEncoding(value) {
+    if (needsEncodingFixup && /[\x80-\xff]/.test(value)) {
+      // Maybe multi-byte UTF-8.
+      return textdecode('utf-8', value);
+    }
+    return value;
+  }
+  // Joins the continuation parts filename*0, filename*1, ... in order.
+  function rfc2231getparam(contentDisposition) {
+    let matches = [], match;
+    // Iterate over all filename*n= and filename*n*= with n being an integer
+    // of at least zero. Any non-zero number must not start with '0'.
+    let iter = /(?:^|;)\s*filename\*((?!0\d)\d+)(\*?)\s*=\s*([^;\s]+)/ig;
+    while ((match = iter.exec(contentDisposition)) !== null) {
+      let [, n, quot, part] = match;
+      n = parseInt(n, 10);
+      if (n in matches) {
+        // Ignore anything after the invalid second filename*0.
+        if (n === 0) {
+          break;
+        }
+        continue;
+      }
+      matches[n] = [quot, part];
+    }
+    let parts = [];
+    for (let n = 0; n < matches.length; ++n) {
+      if (!(n in matches)) {
+        // Numbers must be consecutive. Truncate when there is a hole.
+        break;
+      }
+      let [quot, part] = matches[n];
+      part = rfc2616unquote(part);
+      if (quot) {
+        part = unescape(part);
+        if (n === 0) {
+          part = rfc5987decode(part);
+        }
+      }
+      parts.push(part);
+    }
+    return parts.join('');
+  }
+  // Strips the quotes and backslash escapes of an RFC 2616 quoted-string.
+  function rfc2616unquote(value) {
+    if (value.charAt(0) === '"') {
+      let parts = value.slice(1).split('\\"');
+      // Find the first unescaped " and terminate there.
+      for (let i = 0; i < parts.length; ++i) {
+        let quotindex = parts[i].indexOf('"');
+        if (quotindex !== -1) {
+          parts[i] = parts[i].slice(0, quotindex);
+          parts.length = i + 1; // Truncate, which also ends the loop.
+        }
+        parts[i] = parts[i].replace(/\\(.)/g, '$1');
+      }
+      value = parts.join('"');
+    }
+    return value;
+  }
+  function rfc5987decode(extvalue) {
+    // Decodes "ext-value" from RFC 5987.
+    let encodingend = extvalue.indexOf('\'');
+    if (encodingend === -1) {
+      // Some servers send "filename*=" without encoding'language' prefix,
+      // e.g. in https://github.com/Rob--W/open-in-browser/issues/26
+      // Let's accept the value like Firefox (57) (Chrome 62 rejects it).
+      return extvalue;
+    }
+    let encoding = extvalue.slice(0, encodingend);
+    let langvalue = extvalue.slice(encodingend + 1);
+    // Ignore language (RFC 5987 section 3.2.1, and RFC 6266 section 4.1 ).
+    let value = langvalue.replace(/^[^']*'/, '');
+    return textdecode(encoding, value);
+  }
+  function rfc2047decode(value) {
+    // RFC 2047-decode the result. Firefox tried to drop support for it, but
+    // backed out because some servers use it - https://bugzil.la/875615
+    // Firefox's condition for decoding is here: https://searchfox.org/mozilla-central/rev/4a590a5a15e35d88a3b23dd6ac3c471cf85b04a8/netwerk/mime/nsMIMEHeaderParamImpl.cpp#742-748
+    // We are more strict and only recognize RFC 2047-encoding if the value
+    // starts with "=?", since then it is likely that the full value is
+    // RFC 2047-encoded.
+    // Firefox also decodes words even where RFC 2047 section 5 states:
+    // "An 'encoded-word' MUST NOT appear within a 'quoted-string'."
+    if (value.slice(0, 2) !== '=?' || /[\x00-\x19\x80-\xff]/.test(value)) {
+      return value;
+    }
+    // RFC 2047, section 2.4
+    // encoded-word = "=?" charset "?" encoding "?" encoded-text "?="
+    // charset = token (restricted here to chars of a possibly valid encoding).
+    // encoding = q or b
+    // encoded-text = any printable ASCII character other than ? or space.
+    //        ... but Firefox permits ? and space.
+    return value.replace(/=\?([\w\-]*)\?([QqBb])\?((?:[^?]|\?(?!=))*)\?=/g,
+      function(_, charset, encoding, text) {
+        if (encoding === 'q' || encoding === 'Q') {
+          // RFC 2047 section 4.2.
+          text = text.replace(/_/g, ' ');
+          text = text.replace(/=([0-9a-fA-F]{2})/g, function(_, hex) {
+            return String.fromCharCode(parseInt(hex, 16));
+          });
+          return textdecode(charset, text);
+        } // else encoding is b or B - base64 (RFC 2047 section 4.1)
+        try {
+          text = atob(text);
+        } catch (e) {
+          // Not valid base64: fall through with the text as-is.
+        }
+        return textdecode(charset, text);
+      });
+  }
+
+  return '';
+}
+
+export {
+  getFilenameFromContentDispositionHeader,
+};
diff --git a/src/display/network_utils.js b/src/display/network_utils.js
index 1b0eb0ee9..be55e4129 100644
--- a/src/display/network_utils.js
+++ b/src/display/network_utils.js
@@ -16,7 +16,9 @@
 import {
   assert, MissingPDFException, UnexpectedResponseException
 } from '../shared/util';
-import { getFilenameFromUrl } from './dom_utils';
+import {
+  getFilenameFromContentDispositionHeader
+} from './content_disposition';
 
 function validateRangeRequestCapabilities({ getResponseHeader, isHttp,
                                             rangeChunkSize, disableRange, }) {
@@ -56,10 +58,9 @@ function validateRangeRequestCapabilities({ getResponseHeader, isHttp,
 function extractFilenameFromHeader(getResponseHeader) {
   const contentDisposition = getResponseHeader('Content-Disposition');
   if (contentDisposition) {
-    let parts =
-      /.+;\s*filename=(?:'|")(.+\.pdf)(?:'|")/gi.exec(contentDisposition);
-    if (parts !== null && parts.length > 1) {
-      return getFilenameFromUrl(parts[1]);
+    let filename = getFilenameFromContentDispositionHeader(contentDisposition);
+    if (/\.pdf$/i.test(filename)) {
+      return filename;
     }
   }
   return null;
diff --git a/test/unit/network_utils_spec.js b/test/unit/network_utils_spec.js
index d4fa96e8b..56de74408 100644
--- a/test/unit/network_utils_spec.js
+++ b/test/unit/network_utils_spec.js
@@ -173,6 +173,53 @@ describe('network_utils', function() {
           return 'attachment; filename="filename.pdf"';
         }
       })).toEqual('filename.pdf');
+
+      expect(extractFilenameFromHeader((headerName) => {
+        if (headerName === 'Content-Disposition') {
+          return 'attachment; filename=filename.pdf';
+        }
+      })).toEqual('filename.pdf');
+    });
+
+    it('gets the filename from the response header (RFC 6266)', function() {
+      expect(extractFilenameFromHeader((headerName) => {
+        if (headerName === 'Content-Disposition') {
+          return 'attachment; filename*=filename.pdf';
+        }
+      })).toEqual('filename.pdf');
+
+      expect(extractFilenameFromHeader((headerName) => {
+        if (headerName === 'Content-Disposition') {
+          return 'attachment; filename*=\'\'filename.pdf';
+        }
+      })).toEqual('filename.pdf');
+
+      expect(extractFilenameFromHeader((headerName) => {
+        if (headerName === 'Content-Disposition') {
+          return 'attachment; filename*=utf-8\'\'filename.pdf';
+        }
+      })).toEqual('filename.pdf');
+
+      expect(extractFilenameFromHeader((headerName) => {
+        if (headerName === 'Content-Disposition') {
+          return 'attachment; filename=no.pdf; filename*=utf-8\'\'filename.pdf';
+        }
+      })).toEqual('filename.pdf');
+
+      expect(extractFilenameFromHeader((headerName) => {
+        if (headerName === 'Content-Disposition') {
+          return 'attachment; filename*=utf-8\'\'filename.pdf; filename=no.pdf';
+        }
+      })).toEqual('filename.pdf');
+    });
+
+    it('gets the filename from the response header (RFC 2231)', function() {
+      // Tests continuations (RFC 2231 section 3, via RFC 5987 section 3.1).
+      expect(extractFilenameFromHeader((headerName) => {
+        if (headerName === 'Content-Disposition') {
+          return 'attachment; filename*0=filename; filename*1=.pdf';
+        }
+      })).toEqual('filename.pdf');
     });
 
     it('only extracts filename with pdf extension', function () {