/**
 * Checks whether the given URL uses the "data:" scheme, ignoring any leading
 * whitespace and the letter case of the scheme.
 * @param {string} url
 * @returns {boolean}
 */
function isDataScheme(url) {
  const ii = url.length;
  let i = 0;
  // Step over leading whitespace one character at a time, rather than
  // trimming (and thereby copying) the whole string.
  while (i < ii && url[i].trim() === "") {
    i++;
  }
  return url.substring(i, i + 5).toLowerCase() === "data:";
}

/**
 * Returns the filename or guessed filename from the url (see issue 3455).
 *
 * The path is searched first, then the query string, then the reference
 * fragment; the first component that contains a `NAME.pdf` wins. If the match
 * contains percent-escapes, it is decoded and searched again so that e.g.
 * `%2Fpath%2Fto%2Ffile.pdf` yields `file.pdf`.
 *
 * @param {string} url - The original PDF location.
 * @param {string} defaultFilename - The value returned if the filename is
 *   unknown, or the protocol is unsupported.
 * @returns {string} Guessed PDF filename.
 */
function getPdfFilenameFromUrl(url, defaultFilename = "document.pdf") {
  if (typeof url !== "string") {
    return defaultFilename;
  }
  if (isDataScheme(url)) {
    warn('getPdfFilenameFromUrl: ignore "data:"-URL for performance reasons.');
    return defaultFilename;
  }
  // The reference part is matched with `[\s\S]*` rather than `.*`, since `.`
  // does not match line terminators; otherwise a URL with a newline in its
  // fragment would make `exec` return `null` and the lookups below throw.
  const reURI = /^(?:(?:[^:]+:)?\/\/[^/]+)?([^?#]*)(\?[^#]*)?(#[\s\S]*)?$/;
  //              SCHEME        HOST        1.PATH  2.QUERY   3.REF
  // Pattern to get last matching NAME.pdf
  const reFilename = /[^/?#=]+\.pdf\b(?!.*\.pdf\b)/i;
  const splitURI = reURI.exec(url);
  const match =
    reFilename.exec(splitURI[1]) ||
    reFilename.exec(splitURI[2]) ||
    reFilename.exec(splitURI[3]);
  if (!match) {
    return defaultFilename;
  }

  let suggestedFilename = match[0];
  if (suggestedFilename.includes("%")) {
    // URL-encoded %2Fpath%2Fto%2Ffile.pdf should be file.pdf
    let decoded = null;
    try {
      decoded = decodeURIComponent(suggestedFilename);
    } catch (ex) {
      // URIError "Malformed URI", e.g. for "%AA.pdf"; keep the raw match.
    }
    if (decoded !== null) {
      // The decoded text may no longer contain a usable name (e.g. "%2F.pdf"
      // decodes to "/.pdf"); in that case keep the raw match as well.
      const decodedMatch = reFilename.exec(decoded);
      if (decodedMatch) {
        suggestedFilename = decodedMatch[0];
      }
    }
  }
  return suggestedFilename || defaultFilename;
}
expect(getPdfFilenameFromUrl("/pdfs/file1.txt", "qwerty1.pdf")).toEqual( + "qwerty1.pdf" + ); + // Absolute URL + expect( + getPdfFilenameFromUrl( + "http://www.example.com/pdfs/file2.txt", + "qwerty2.pdf" + ) + ).toEqual("qwerty2.pdf"); + + // An empty string should be a valid custom fallback filename. + expect(getPdfFilenameFromUrl("/pdfs/file3.txt", "")).toEqual(""); + }); + + it("gets fallback filename when url is not a string", function () { + expect(getPdfFilenameFromUrl(null)).toEqual("document.pdf"); + + expect(getPdfFilenameFromUrl(null, "file.pdf")).toEqual("file.pdf"); + }); + + it("gets PDF filename from URL containing leading/trailing whitespace", function () { + // Relative URL + expect(getPdfFilenameFromUrl(" /pdfs/file1.pdf ")).toEqual( + "file1.pdf" + ); + // Absolute URL + expect( + getPdfFilenameFromUrl(" http://www.example.com/pdfs/file2.pdf ") + ).toEqual("file2.pdf"); + }); + + it("gets PDF filename from query string", function () { + // Relative URL + expect(getPdfFilenameFromUrl("/pdfs/pdfs.html?name=file1.pdf")).toEqual( + "file1.pdf" + ); + // Absolute URL + expect( + getPdfFilenameFromUrl("http://www.example.com/pdfs/pdf.html?file2.pdf") + ).toEqual("file2.pdf"); + }); + + it("gets PDF filename from hash string", function () { + // Relative URL + expect(getPdfFilenameFromUrl("/pdfs/pdfs.html#name=file1.pdf")).toEqual( + "file1.pdf" + ); + // Absolute URL + expect( + getPdfFilenameFromUrl("http://www.example.com/pdfs/pdf.html#file2.pdf") + ).toEqual("file2.pdf"); + }); + + it("gets correct PDF filename when multiple ones are present", function () { + // Relative URL + expect(getPdfFilenameFromUrl("/pdfs/file1.pdf?name=file.pdf")).toEqual( + "file1.pdf" + ); + // Absolute URL + expect( + getPdfFilenameFromUrl("http://www.example.com/pdfs/file2.pdf#file.pdf") + ).toEqual("file2.pdf"); + }); + + it("gets PDF filename from URI-encoded data", function () { + const encodedUrl = encodeURIComponent( + "http://www.example.com/pdfs/file1.pdf" + ); + 
expect(getPdfFilenameFromUrl(encodedUrl)).toEqual("file1.pdf"); + + const encodedUrlWithQuery = encodeURIComponent( + "http://www.example.com/pdfs/file.txt?file2.pdf" + ); + expect(getPdfFilenameFromUrl(encodedUrlWithQuery)).toEqual("file2.pdf"); + }); + + it("gets PDF filename from data mistaken for URI-encoded", function () { + expect(getPdfFilenameFromUrl("/pdfs/%AA.pdf")).toEqual("%AA.pdf"); + + expect(getPdfFilenameFromUrl("/pdfs/%2F.pdf")).toEqual("%2F.pdf"); + }); + + it("gets PDF filename from (some) standard protocols", function () { + // HTTP + expect(getPdfFilenameFromUrl("http://www.example.com/file1.pdf")).toEqual( + "file1.pdf" + ); + // HTTPS + expect( + getPdfFilenameFromUrl("https://www.example.com/file2.pdf") + ).toEqual("file2.pdf"); + // File + expect(getPdfFilenameFromUrl("file:///path/to/files/file3.pdf")).toEqual( + "file3.pdf" + ); + // FTP + expect(getPdfFilenameFromUrl("ftp://www.example.com/file4.pdf")).toEqual( + "file4.pdf" + ); + }); + + it('gets PDF filename from query string appended to "blob:" URL', function () { + if (isNodeJS) { + pending("Blob in not supported in Node.js."); + } + const typedArray = new Uint8Array([1, 2, 3, 4, 5]); + const blobUrl = createObjectURL(typedArray, "application/pdf"); + // Sanity check to ensure that a "blob:" URL was returned. + expect(blobUrl.startsWith("blob:")).toEqual(true); + + expect(getPdfFilenameFromUrl(blobUrl + "?file.pdf")).toEqual("file.pdf"); + }); + + it('gets fallback filename from query string appended to "data:" URL', function () { + const typedArray = new Uint8Array([1, 2, 3, 4, 5]); + const dataUrl = createObjectURL( + typedArray, + "application/pdf", + /* forceDataSchema = */ true + ); + // Sanity check to ensure that a "data:" URL was returned. + expect(dataUrl.startsWith("data:")).toEqual(true); + + expect(getPdfFilenameFromUrl(dataUrl + "?file1.pdf")).toEqual( + "document.pdf" + ); + + // Should correctly detect a "data:" URL with leading whitespace. 
+ expect(getPdfFilenameFromUrl(" " + dataUrl + "?file2.pdf")).toEqual( + "document.pdf" + ); + }); + }); + describe("isValidFetchUrl", function () { it("handles invalid Fetch URLs", function () { expect(isValidFetchUrl(null)).toEqual(false); diff --git a/test/unit/ui_utils_spec.js b/test/unit/ui_utils_spec.js index b3ca544da..baae83661 100644 --- a/test/unit/ui_utils_spec.js +++ b/test/unit/ui_utils_spec.js @@ -18,7 +18,6 @@ import { binarySearchFirstItem, EventBus, getPageSizeInches, - getPDFFileNameFromURL, getVisibleElements, isPortraitOrientation, isValidRotation, @@ -26,7 +25,6 @@ import { waitOnEventOrTimeout, WaitOnType, } from "../../web/ui_utils.js"; -import { createObjectURL } from "../../src/shared/util.js"; import { isNodeJS } from "../../src/shared/is_node.js"; describe("ui_utils", function () { @@ -58,162 +56,6 @@ describe("ui_utils", function () { }); }); - describe("getPDFFileNameFromURL", function () { - it("gets PDF filename", function () { - // Relative URL - expect(getPDFFileNameFromURL("/pdfs/file1.pdf")).toEqual("file1.pdf"); - // Absolute URL - expect( - getPDFFileNameFromURL("http://www.example.com/pdfs/file2.pdf") - ).toEqual("file2.pdf"); - }); - - it("gets fallback filename", function () { - // Relative URL - expect(getPDFFileNameFromURL("/pdfs/file1.txt")).toEqual("document.pdf"); - // Absolute URL - expect( - getPDFFileNameFromURL("http://www.example.com/pdfs/file2.txt") - ).toEqual("document.pdf"); - }); - - it("gets custom fallback filename", function () { - // Relative URL - expect(getPDFFileNameFromURL("/pdfs/file1.txt", "qwerty1.pdf")).toEqual( - "qwerty1.pdf" - ); - // Absolute URL - expect( - getPDFFileNameFromURL( - "http://www.example.com/pdfs/file2.txt", - "qwerty2.pdf" - ) - ).toEqual("qwerty2.pdf"); - - // An empty string should be a valid custom fallback filename. 
- expect(getPDFFileNameFromURL("/pdfs/file3.txt", "")).toEqual(""); - }); - - it("gets fallback filename when url is not a string", function () { - expect(getPDFFileNameFromURL(null)).toEqual("document.pdf"); - - expect(getPDFFileNameFromURL(null, "file.pdf")).toEqual("file.pdf"); - }); - - it("gets PDF filename from URL containing leading/trailing whitespace", function () { - // Relative URL - expect(getPDFFileNameFromURL(" /pdfs/file1.pdf ")).toEqual( - "file1.pdf" - ); - // Absolute URL - expect( - getPDFFileNameFromURL(" http://www.example.com/pdfs/file2.pdf ") - ).toEqual("file2.pdf"); - }); - - it("gets PDF filename from query string", function () { - // Relative URL - expect(getPDFFileNameFromURL("/pdfs/pdfs.html?name=file1.pdf")).toEqual( - "file1.pdf" - ); - // Absolute URL - expect( - getPDFFileNameFromURL("http://www.example.com/pdfs/pdf.html?file2.pdf") - ).toEqual("file2.pdf"); - }); - - it("gets PDF filename from hash string", function () { - // Relative URL - expect(getPDFFileNameFromURL("/pdfs/pdfs.html#name=file1.pdf")).toEqual( - "file1.pdf" - ); - // Absolute URL - expect( - getPDFFileNameFromURL("http://www.example.com/pdfs/pdf.html#file2.pdf") - ).toEqual("file2.pdf"); - }); - - it("gets correct PDF filename when multiple ones are present", function () { - // Relative URL - expect(getPDFFileNameFromURL("/pdfs/file1.pdf?name=file.pdf")).toEqual( - "file1.pdf" - ); - // Absolute URL - expect( - getPDFFileNameFromURL("http://www.example.com/pdfs/file2.pdf#file.pdf") - ).toEqual("file2.pdf"); - }); - - it("gets PDF filename from URI-encoded data", function () { - const encodedUrl = encodeURIComponent( - "http://www.example.com/pdfs/file1.pdf" - ); - expect(getPDFFileNameFromURL(encodedUrl)).toEqual("file1.pdf"); - - const encodedUrlWithQuery = encodeURIComponent( - "http://www.example.com/pdfs/file.txt?file2.pdf" - ); - expect(getPDFFileNameFromURL(encodedUrlWithQuery)).toEqual("file2.pdf"); - }); - - it("gets PDF filename from data mistaken for 
URI-encoded", function () { - expect(getPDFFileNameFromURL("/pdfs/%AA.pdf")).toEqual("%AA.pdf"); - - expect(getPDFFileNameFromURL("/pdfs/%2F.pdf")).toEqual("%2F.pdf"); - }); - - it("gets PDF filename from (some) standard protocols", function () { - // HTTP - expect(getPDFFileNameFromURL("http://www.example.com/file1.pdf")).toEqual( - "file1.pdf" - ); - // HTTPS - expect( - getPDFFileNameFromURL("https://www.example.com/file2.pdf") - ).toEqual("file2.pdf"); - // File - expect(getPDFFileNameFromURL("file:///path/to/files/file3.pdf")).toEqual( - "file3.pdf" - ); - // FTP - expect(getPDFFileNameFromURL("ftp://www.example.com/file4.pdf")).toEqual( - "file4.pdf" - ); - }); - - it('gets PDF filename from query string appended to "blob:" URL', function () { - if (isNodeJS) { - pending("Blob in not supported in Node.js."); - } - const typedArray = new Uint8Array([1, 2, 3, 4, 5]); - const blobUrl = createObjectURL(typedArray, "application/pdf"); - // Sanity check to ensure that a "blob:" URL was returned. - expect(blobUrl.startsWith("blob:")).toEqual(true); - - expect(getPDFFileNameFromURL(blobUrl + "?file.pdf")).toEqual("file.pdf"); - }); - - it('gets fallback filename from query string appended to "data:" URL', function () { - const typedArray = new Uint8Array([1, 2, 3, 4, 5]); - const dataUrl = createObjectURL( - typedArray, - "application/pdf", - /* forceDataSchema = */ true - ); - // Sanity check to ensure that a "data:" URL was returned. - expect(dataUrl.startsWith("data:")).toEqual(true); - - expect(getPDFFileNameFromURL(dataUrl + "?file1.pdf")).toEqual( - "document.pdf" - ); - - // Should correctly detect a "data:" URL with leading whitespace. 
- expect(getPDFFileNameFromURL(" " + dataUrl + "?file2.pdf")).toEqual( - "document.pdf" - ); - }); - }); - describe("EventBus", function () { it("dispatch event", function () { const eventBus = new EventBus(); diff --git a/web/app.js b/web/app.js index 6bdcaef98..28e779418 100644 --- a/web/app.js +++ b/web/app.js @@ -22,7 +22,6 @@ import { DEFAULT_SCALE_VALUE, EventBus, getActiveOrFocusedElement, - getPDFFileNameFromURL, isValidRotation, isValidScrollMode, isValidSpreadMode, @@ -44,6 +43,7 @@ import { createPromiseCapability, getDocument, getFilenameFromUrl, + getPdfFilenameFromUrl, GlobalWorkerOptions, InvalidPDFException, isPdfFile, @@ -748,7 +748,7 @@ const PDFViewerApplication = { setTitleUsingUrl(url = "") { this.url = url; this.baseUrl = url.split("#")[0]; - let title = getPDFFileNameFromURL(url, ""); + let title = getPdfFilenameFromUrl(url, ""); if (!title) { try { title = decodeURIComponent(getFilenameFromUrl(url)) || url; @@ -772,7 +772,7 @@ const PDFViewerApplication = { get _docFilename() { // Use `this.url` instead of `this.baseUrl` to perform filename detection // based on the reference fragment as ultimate fallback if needed. - return this._contentDispositionFilename || getPDFFileNameFromURL(this.url); + return this._contentDispositionFilename || getPdfFilenameFromUrl(this.url); }, /** diff --git a/web/generic_scripting.js b/web/generic_scripting.js index b1748f9bb..08cfff3a1 100644 --- a/web/generic_scripting.js +++ b/web/generic_scripting.js @@ -13,8 +13,7 @@ * limitations under the License. 
*/ -import { getPDFFileNameFromURL } from "./ui_utils.js"; -import { loadScript } from "pdfjs-lib"; +import { getPdfFilenameFromUrl, loadScript } from "pdfjs-lib"; async function docPropertiesLookup(pdfDocument) { const url = "", @@ -37,7 +36,7 @@ async function docPropertiesLookup(pdfDocument) { ...info, baseURL: baseUrl, filesize: contentLength, - filename: contentDispositionFilename || getPDFFileNameFromURL(url), + filename: contentDispositionFilename || getPdfFilenameFromUrl(url), metadata: metadata?.getRaw(), authors: metadata?.get("dc:creator"), numPages: pdfDocument.numPages, diff --git a/web/pdf_document_properties.js b/web/pdf_document_properties.js index ab1173b6b..4b8833444 100644 --- a/web/pdf_document_properties.js +++ b/web/pdf_document_properties.js @@ -13,12 +13,12 @@ * limitations under the License. */ -import { createPromiseCapability, PDFDateString } from "pdfjs-lib"; import { - getPageSizeInches, - getPDFFileNameFromURL, - isPortraitOrientation, -} from "./ui_utils.js"; + createPromiseCapability, + getPdfFilenameFromUrl, + PDFDateString, +} from "pdfjs-lib"; +import { getPageSizeInches, isPortraitOrientation } from "./ui_utils.js"; const DEFAULT_FIELD_CONTENT = "-"; @@ -140,7 +140,7 @@ class PDFDocumentProperties { pageSize, isLinearized, ] = await Promise.all([ - contentDispositionFilename || getPDFFileNameFromURL(this.url), + contentDispositionFilename || getPdfFilenameFromUrl(this.url), this._parseFileSize(contentLength), this._parseDate(info.CreationDate), this._parseDate(info.ModDate), diff --git a/web/ui_utils.js b/web/ui_utils.js index c90bfd428..a5995a6c9 100644 --- a/web/ui_utils.js +++ b/web/ui_utils.js @@ -570,60 +570,6 @@ function noContextMenuHandler(evt) { evt.preventDefault(); } -function isDataSchema(url) { - let i = 0; - const ii = url.length; - while (i < ii && url[i].trim() === "") { - i++; - } - return url.substring(i, i + 5).toLowerCase() === "data:"; -} - -/** - * Returns the filename or guessed filename from the url (see 
issue 3455). - * @param {string} url - The original PDF location. - * @param {string} defaultFilename - The value returned if the filename is - * unknown, or the protocol is unsupported. - * @returns {string} Guessed PDF filename. - */ -function getPDFFileNameFromURL(url, defaultFilename = "document.pdf") { - if (typeof url !== "string") { - return defaultFilename; - } - if (isDataSchema(url)) { - console.warn( - "getPDFFileNameFromURL: " + - 'ignoring "data:" URL for performance reasons.' - ); - return defaultFilename; - } - const reURI = /^(?:(?:[^:]+:)?\/\/[^/]+)?([^?#]*)(\?[^#]*)?(#.*)?$/; - // SCHEME HOST 1.PATH 2.QUERY 3.REF - // Pattern to get last matching NAME.pdf - const reFilename = /[^/?#=]+\.pdf\b(?!.*\.pdf\b)/i; - const splitURI = reURI.exec(url); - let suggestedFilename = - reFilename.exec(splitURI[1]) || - reFilename.exec(splitURI[2]) || - reFilename.exec(splitURI[3]); - if (suggestedFilename) { - suggestedFilename = suggestedFilename[0]; - if (suggestedFilename.includes("%")) { - // URL-encoded %2Fpath%2Fto%2Ffile.pdf should be file.pdf - try { - suggestedFilename = reFilename.exec( - decodeURIComponent(suggestedFilename) - )[0]; - } catch (ex) { - // Possible (extremely rare) errors: - // URIError "Malformed URI", e.g. for "%AA.pdf" - // TypeError "null has no properties", e.g. for "%2F.pdf" - } - } - } - return suggestedFilename || defaultFilename; -} - function normalizeWheelEventDirection(evt) { let delta = Math.hypot(evt.deltaX, evt.deltaY); const angle = Math.atan2(evt.deltaY, evt.deltaX); @@ -1063,7 +1009,6 @@ export { getActiveOrFocusedElement, getOutputScale, getPageSizeInches, - getPDFFileNameFromURL, getVisibleElements, isPortraitOrientation, isValidRotation,