diff --git a/src/core/catalog.js b/src/core/catalog.js index f4bfd1161..c63e67d1b 100644 --- a/src/core/catalog.js +++ b/src/core/catalog.js @@ -13,14 +13,6 @@ * limitations under the License. */ -import { - addDefaultProtocolToUrl, - collectActions, - MissingDataException, - recoverJsURL, - toRomanNumerals, - tryConvertUrlEncoding, -} from "./core_utils.js"; import { clearPrimitiveCaches, Dict, @@ -29,9 +21,16 @@ import { isRef, isRefsEqual, isStream, + Name, RefSet, RefSetCache, } from "./primitives.js"; +import { + collectActions, + MissingDataException, + recoverJsURL, + toRomanNumerals, +} from "./core_utils.js"; import { createPromiseCapability, createValidAbsoluteUrl, @@ -1331,11 +1330,9 @@ class Catalog { switch (actionName) { case "URI": url = action.get("URI"); - if (isName(url)) { + if (url instanceof Name) { // Some bad PDFs do not put parentheses around relative URLs. url = "/" + url.name; - } else if (isString(url)) { - url = addDefaultProtocolToUrl(url); } // TODO: pdf spec mentions urls can be relative to a Base // entry in the dictionary. @@ -1426,8 +1423,10 @@ class Catalog { } if (isString(url)) { - url = tryConvertUrlEncoding(url); - const absoluteUrl = createValidAbsoluteUrl(url, docBaseUrl); + const absoluteUrl = createValidAbsoluteUrl(url, docBaseUrl, { + addDefaultProtocol: true, + tryConvertEncoding: true, + }); if (absoluteUrl) { resultObj.url = absoluteUrl.href; } diff --git a/src/core/core_utils.js b/src/core/core_utils.js index bf3f18388..2e9b19e3f 100644 --- a/src/core/core_utils.js +++ b/src/core/core_utils.js @@ -18,7 +18,6 @@ import { BaseException, objectSize, stringToPDFString, - stringToUTF8String, warn, } from "../shared/util.js"; import { Dict, isName, isRef, isStream, RefSet } from "./primitives.js"; @@ -452,21 +451,6 @@ function validateCSSFont(cssFontInfo) { return true; } -// Let URLs beginning with 'www.' default to using the 'http://' protocol. -function addDefaultProtocolToUrl(url) { - return url.startsWith("www.") ? `http://${url}` : url; -} - -// According to ISO 32000-1:2008, section 12.6.4.7, URIs should be encoded -// in 7-bit ASCII. Some bad PDFs use UTF-8 encoding; see Bugzilla 1122280. -function tryConvertUrlEncoding(url) { - try { - return stringToUTF8String(url); - } catch (e) { - return url; - } -} - function recoverJsURL(str) { // Attempt to recover valid URLs from `JS` entries with certain // white-listed formats: @@ -496,7 +480,6 @@ function recoverJsURL(str) { } export { - addDefaultProtocolToUrl, collectActions, encodeToXmlString, escapePDFName, @@ -513,7 +496,6 @@ export { readUint32, recoverJsURL, toRomanNumerals, - tryConvertUrlEncoding, validateCSSFont, XRefEntryException, XRefParseException, diff --git a/src/core/xfa/html_utils.js b/src/core/xfa/html_utils.js index 22a5c1785..463611f9c 100644 --- a/src/core/xfa/html_utils.js +++ b/src/core/xfa/html_utils.js @@ -26,10 +26,6 @@ import { $toStyle, XFAObject, } from "./xfa_object.js"; -import { - addDefaultProtocolToUrl, - tryConvertUrlEncoding, -} from "../core_utils.js"; import { createValidAbsoluteUrl, warn } from "../../shared/util.js"; import { getMeasurement, stripQuotes } from "./utils.js"; import { selectFont } from "./fonts.js"; @@ -638,15 +634,11 @@ function setFontFamily(xfaFont, node, fontFinder, style) { } function fixURL(str) { - if (typeof str === "string") { - let url = addDefaultProtocolToUrl(str); - url = tryConvertUrlEncoding(url); - const absoluteUrl = createValidAbsoluteUrl(url); - if (absoluteUrl) { - return absoluteUrl.href; - } - } - return null; + const absoluteUrl = createValidAbsoluteUrl(str, /* baseUrl = */ null, { + addDefaultProtocol: true, + tryConvertEncoding: true, + }); + return absoluteUrl ? absoluteUrl.href : null; } export { diff --git a/src/shared/util.js b/src/shared/util.js index 2afaa1674..d3e43a157 100644 --- a/src/shared/util.js +++ b/src/shared/util.js @@ -448,14 +448,35 @@ function _isValidProtocol(url) { * Attempts to create a valid absolute URL. * * @param {URL|string} url - An absolute, or relative, URL. - * @param {URL|string} baseUrl - An absolute URL. + * @param {URL|string} [baseUrl] - An absolute URL. + * @param {Object} [options] * @returns Either a valid {URL}, or `null` otherwise. */ -function createValidAbsoluteUrl(url, baseUrl) { +function createValidAbsoluteUrl(url, baseUrl = null, options = null) { if (!url) { return null; } try { + if (options && typeof url === "string") { + // Let URLs beginning with "www." default to using the "http://" protocol. + if (options.addDefaultProtocol && url.startsWith("www.")) { + const dots = url.match(/\./g); + // Avoid accidentally matching a *relative* URL pointing to a file named + // e.g. "www.pdf" or similar. + if (dots && dots.length >= 2) { + url = `http://${url}`; + } + } + + // According to ISO 32000-1:2008, section 12.6.4.7, URIs should be encoded + // in 7-bit ASCII. Some bad PDFs use UTF-8 encoding; see bug 1122280. + if (options.tryConvertEncoding) { + try { + url = stringToUTF8String(url); + } catch (ex) {} + } + } + const absoluteUrl = baseUrl ? new URL(url, baseUrl) : new URL(url); if (_isValidProtocol(absoluteUrl)) { return absoluteUrl; diff --git a/test/unit/annotation_spec.js b/test/unit/annotation_spec.js index 9a0c22ca7..26a470fe6 100644 --- a/test/unit/annotation_spec.js +++ b/test/unit/annotation_spec.js @@ -796,7 +796,7 @@ describe("annotation", function () { ); expect(data.annotationType).toEqual(AnnotationType.LINK); expect(data.url).toEqual("http://www.hmrc.gov.uk/"); - expect(data.unsafeUrl).toEqual("http://www.hmrc.gov.uk"); + expect(data.unsafeUrl).toEqual("www.hmrc.gov.uk"); expect(data.dest).toBeUndefined(); } ); @@ -843,7 +843,7 @@ describe("annotation", function () { ).href ); expect(data.unsafeUrl).toEqual( - stringToUTF8String("http://www.example.com/\xC3\xBC\xC3\xB6\xC3\xA4") + "http://www.example.com/\xC3\xBC\xC3\xB6\xC3\xA4" ); expect(data.dest).toBeUndefined(); }