From a4e907169efaadec79b75ad92101e0f35312b0b4 Mon Sep 17 00:00:00 2001 From: Rob Wu Date: Fri, 19 Jan 2018 17:39:31 +0100 Subject: [PATCH] Improve correctness of Content-Disposition parser Re-uses logic from https://github.com/Rob--W/open-in-browser/blob/9f5fcae11cf6d99c503a15894f22efdfcd2075b7/extension/content-disposition.js which is already covered by tests: https://github.com/Rob--W/open-in-browser/commit/6f3bbb8bbfc1e3e943200fffdb68d35075e82ddd --- .eslintrc | 5 +- src/display/content_disposition.js | 211 +++++++++++++++++++++++++++++ src/display/network_utils.js | 11 +- test/unit/network_utils_spec.js | 47 +++++++ 4 files changed, 268 insertions(+), 6 deletions(-) create mode 100644 src/display/content_disposition.js diff --git a/.eslintrc b/.eslintrc index 488426187..1dd046d54 100644 --- a/.eslintrc +++ b/.eslintrc @@ -120,7 +120,10 @@ "keyword-spacing": ["error", { "before": true, "after": true, }], "linebreak-style": ["error", "unix"], "lines-between-class-members": ["error", "always"], - "max-len": ["error", 80], + "max-len": ["error", { + "code": 80, + "ignoreUrls": true + }], "new-cap": ["error", { "newIsCap": true, "capIsNew": false, }], "new-parens": "error", "no-array-constructor": "error", diff --git a/src/display/content_disposition.js b/src/display/content_disposition.js new file mode 100644 index 000000000..4b3608814 --- /dev/null +++ b/src/display/content_disposition.js @@ -0,0 +1,211 @@ +/* Copyright 2017 Mozilla Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
// This getFilenameFromContentDispositionHeader function is adapted from
// https://github.com/Rob--W/open-in-browser/blob/9f5fcae11cf6d99c503a15894f22efdfcd2075b7/extension/content-disposition.js
// with the following changes:
// - Modified to conform to PDF.js's coding style.
// - Support UTF-8 decoding when TextDecoder is unsupported.
// - Replace Array.from with a typed array + loop for compat with old browsers.
// - Replace "startsWith" with other string method for compat with old browsers.
// - Move return to the end of the function to prevent Babel from dropping the
//   function declarations.

/**
 * Extract file name from the Content-Disposition HTTP response header.
 *
 * The following forms are tried, in order of precedence:
 *  1. `filename*=ext-value` (RFC 5987, referenced by RFC 6266),
 *  2. `filename*0=...; filename*1=...` continuations (RFC 2231 section 3),
 *  3. `filename=value`.
 * Parameter names are matched case-insensitively. The extracted value is
 * additionally RFC 2047-decoded when it starts with "=?", and re-interpreted
 * as UTF-8 when it contains high bytes and no charset was applied before.
 * Values that cannot be decoded are returned undecoded; decoding failures
 * never throw.
 *
 * @param {string} contentDisposition
 * @return {string} Filename, if found in the Content-Disposition header.
 *   An empty string when the header carries no file name.
 */
function getFilenameFromContentDispositionHeader(contentDisposition) {
  // Cleared by textdecode() once a charset has been applied successfully, so
  // that fixupEncoding() does not decode the same value a second time.
  let needsEncodingFixup = true;

  // filename*=ext-value ("ext-value" from RFC 5987, referenced by RFC 6266).
  let tmp = /(?:^|;)\s*filename\*\s*=\s*([^;\s]+)/i.exec(contentDisposition);
  if (tmp) {
    tmp = tmp[1];
    let filename = rfc2616unquote(tmp);
    // Percent-decode into a string of raw bytes (one byte per code unit).
    filename = unescape(filename);
    filename = rfc5987decode(filename);
    filename = rfc2047decode(filename);
    return fixupEncoding(filename);
  }

  // Continuations (RFC 2231 section 3, referenced by RFC 5987 section 3.1).
  // filename*n*=part
  // filename*n=part
  tmp = rfc2231getparam(contentDisposition);
  if (tmp) {
    // The joined value may still be RFC 2047-encoded.
    const filename = rfc2047decode(tmp);
    return fixupEncoding(filename);
  }

  // filename=value (RFC 5987, section 4.1).
  // Case-insensitive, like the filename* patterns above.
  tmp = /(?:^|;)\s*filename\s*=\s*([^;\s]+)/i.exec(contentDisposition);
  if (tmp) {
    tmp = tmp[1];
    let filename = rfc2616unquote(tmp);
    filename = rfc2047decode(filename);
    return fixupEncoding(filename);
  }

  // After this line there are only function declarations. We cannot put
  // "return" here for readability because babel would then drop the function
  // declarations...

  // Interprets `value` (a string of raw bytes) in the charset `encoding`.
  // Returns `value` unchanged when it cannot be decoded.
  function textdecode(encoding, value) {
    if (encoding) {
      // Only a string whose code units all fit in one byte can be
      // re-interpreted in another charset; leave anything else untouched.
      if (!/^[\x00-\xFF]+$/.test(value)) {
        return value;
      }
      try {
        const decoder = new TextDecoder(encoding, { fatal: true, });
        const bytes = new Uint8Array(value.length);
        for (let i = 0; i < value.length; ++i) {
          bytes[i] = value.charCodeAt(i);
        }
        value = decoder.decode(bytes);
        needsEncodingFixup = false;
      } catch (e) {
        // TextDecoder constructor threw - unrecognized encoding.
        // Or TextDecoder API is not available.
        // Or the bytes are not valid in the given encoding (fatal mode).
        if (/^utf-?8$/i.test(encoding)) {
          // UTF-8 is commonly used, try to support it in another way:
          try {
            value = decodeURIComponent(escape(value));
            needsEncodingFixup = false;
          } catch (err) {
            // Malformed UTF-8 (URIError): keep the undecoded value instead
            // of letting the exception escape from the parser.
          }
        }
      }
    }
    return value;
  }
  // Last-resort decoding for values that carry high bytes but for which no
  // charset was applied.
  function fixupEncoding(value) {
    if (needsEncodingFixup && /[\x80-\xff]/.test(value)) {
      // Maybe multi-byte UTF-8.
      return textdecode('utf-8', value);
    }
    return value;
  }
  // Collects and joins the filename*n= / filename*n*= continuation parts.
  // Returns an empty string when there is no filename*0 part.
  function rfc2231getparam(contentDisposition) {
    const matches = [];
    let match;
    // Iterate over all filename*n= and filename*n*= with n being an integer
    // of at least zero. Any non-zero number must not start with '0'.
    const iter = /(?:^|;)\s*filename\*((?!0\d)\d+)(\*?)\s*=\s*([^;\s]+)/ig;
    while ((match = iter.exec(contentDisposition)) !== null) {
      let [, n, quot, part] = match;
      n = parseInt(n, 10);
      if (n in matches) {
        // Ignore anything after the invalid second filename*0.
        if (n === 0) {
          break;
        }
        continue;
      }
      matches[n] = [quot, part];
    }
    const parts = [];
    for (let n = 0; n < matches.length; ++n) {
      if (!(n in matches)) {
        // Numbers must be consecutive. Truncate when there is a hole.
        break;
      }
      let [quot, part] = matches[n];
      part = rfc2616unquote(part);
      if (quot) {
        // A trailing "*" marks the part as percent-encoded.
        part = unescape(part);
        if (n === 0) {
          // Only the first part carries the charset'language' prefix.
          part = rfc5987decode(part);
        }
      }
      parts.push(part);
    }
    return parts.join('');
  }
  // Strips the quotes and backslash escapes of a quoted-string; other values
  // are returned unchanged.
  function rfc2616unquote(value) {
    if (value.charAt(0) === '"') {
      const parts = value.slice(1).split('\\"');
      // Find the first unescaped " and terminate there.
      for (let i = 0; i < parts.length; ++i) {
        const quotindex = parts[i].indexOf('"');
        if (quotindex !== -1) {
          parts[i] = parts[i].slice(0, quotindex);
          parts.length = i + 1; // Truncates and stop the iteration.
        }
        parts[i] = parts[i].replace(/\\(.)/g, '$1');
      }
      value = parts.join('"');
    }
    return value;
  }
  function rfc5987decode(extvalue) {
    // Decodes "ext-value" from RFC 5987.
    const encodingend = extvalue.indexOf('\'');
    if (encodingend === -1) {
      // Some servers send "filename*=" without encoding'language' prefix,
      // e.g. in https://github.com/Rob--W/open-in-browser/issues/26
      // Let's accept the value like Firefox (57) (Chrome 62 rejects it).
      return extvalue;
    }
    const encoding = extvalue.slice(0, encodingend);
    const langvalue = extvalue.slice(encodingend + 1);
    // Ignore language (RFC 5987 section 3.2.1, and RFC 6266 section 4.1 ).
    const value = langvalue.replace(/^[^']*'/, '');
    return textdecode(encoding, value);
  }
  function rfc2047decode(value) {
    // RFC 2047-decode the result. Firefox tried to drop support for it, but
    // backed out because some servers use it - https://bugzil.la/875615
    // Firefox's condition for decoding is here: https://searchfox.org/mozilla-central/rev/4a590a5a15e35d88a3b23dd6ac3c471cf85b04a8/netwerk/mime/nsMIMEHeaderParamImpl.cpp#742-748

    // We are more strict and only recognize RFC 2047-encoding if the value
    // starts with "=?", since then it is likely that the full value is
    // RFC 2047-encoded.

    // Firefox also decodes words even where RFC 2047 section 5 states:
    // "An 'encoded-word' MUST NOT appear within a 'quoted-string'."
    if (value.slice(0, 2) !== '=?' || /[\x00-\x19\x80-\xff]/.test(value)) {
      return value;
    }
    // RFC 2047, section 2.4
    // encoded-word = "=?" charset "?" encoding "?" encoded-text "?="
    // charset = token (but let's restrict to characters that denote a
    //           possibly valid encoding).
    // encoding = q or b
    // encoded-text = any printable ASCII character other than ? or space.
    //                ... but Firefox permits ? and space.
    return value.replace(/=\?([\w\-]*)\?([QqBb])\?((?:[^?]|\?(?!=))*)\?=/g,
      function(_, charset, encoding, text) {
        if (encoding === 'q' || encoding === 'Q') {
          // RFC 2047 section 4.2.
          text = text.replace(/_/g, ' ');
          text = text.replace(/=([0-9a-fA-F]{2})/g, function(_, hex) {
            return String.fromCharCode(parseInt(hex, 16));
          });
          return textdecode(charset, text);
        } // else encoding is b or B - base64 (RFC 2047 section 4.1)
        try {
          text = atob(text);
        } catch (e) {
          // Not valid base64: keep the raw encoded-text.
          return text;
        }
        // Apply the declared charset, exactly like the "Q" branch does.
        return textdecode(charset, text);
      });
  }

  return '';
}

export {
  getFilenameFromContentDispositionHeader,
};
/.+;\s*filename=(?:'|")(.+\.pdf)(?:'|")/gi.exec(contentDisposition); - if (parts !== null && parts.length > 1) { - return getFilenameFromUrl(parts[1]); + let filename = getFilenameFromContentDispositionHeader(contentDisposition); + if (/\.pdf$/i.test(filename)) { + return filename; } } return null; diff --git a/test/unit/network_utils_spec.js b/test/unit/network_utils_spec.js index d4fa96e8b..56de74408 100644 --- a/test/unit/network_utils_spec.js +++ b/test/unit/network_utils_spec.js @@ -173,6 +173,53 @@ describe('network_utils', function() { return 'attachment; filename="filename.pdf"'; } })).toEqual('filename.pdf'); + + expect(extractFilenameFromHeader((headerName) => { + if (headerName === 'Content-Disposition') { + return 'attachment; filename=filename.pdf'; + } + })).toEqual('filename.pdf'); + }); + + it('gets the filename from the response header (RFC 6266)', function() { + expect(extractFilenameFromHeader((headerName) => { + if (headerName === 'Content-Disposition') { + return 'attachment; filename*=filename.pdf'; + } + })).toEqual('filename.pdf'); + + expect(extractFilenameFromHeader((headerName) => { + if (headerName === 'Content-Disposition') { + return 'attachment; filename*=\'\'filename.pdf'; + } + })).toEqual('filename.pdf'); + + expect(extractFilenameFromHeader((headerName) => { + if (headerName === 'Content-Disposition') { + return 'attachment; filename*=utf-8\'\'filename.pdf'; + } + })).toEqual('filename.pdf'); + + expect(extractFilenameFromHeader((headerName) => { + if (headerName === 'Content-Disposition') { + return 'attachment; filename=no.pdf; filename*=utf-8\'\'filename.pdf'; + } + })).toEqual('filename.pdf'); + + expect(extractFilenameFromHeader((headerName) => { + if (headerName === 'Content-Disposition') { + return 'attachment; filename*=utf-8\'\'filename.pdf; filename=no.pdf'; + } + })).toEqual('filename.pdf'); + }); + + it('gets the filename from the response header (RFC 2231)', function() { + // Tests continuations (RFC 2231 section 3, 
via RFC 5987 section 3.1). + expect(extractFilenameFromHeader((headerName) => { + if (headerName === 'Content-Disposition') { + return 'attachment; filename*0=filename; filename*1=.pdf'; + } + })).toEqual('filename.pdf'); }); it('only extracts filename with pdf extension', function () {