From 5bc7339c1b51f982e79f4f2e296eedea22f8d4e3 Mon Sep 17 00:00:00 2001 From: Jonas Jenwald Date: Tue, 19 Apr 2022 16:53:44 +0200 Subject: [PATCH] Add support for the /Catalog Base-URI when resolving URLs (issue 14802) As far as I can tell, this is actually the very first time that we've seen a PDF document with a Base-URI specified in the /Catalog; please refer to the specification: https://web.archive.org/web/20220309040754if_/https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/PDF32000_2008.pdf#G11.2097122 To simplify the overall implementation, this new parameter is accessed via the existing `BasePdfManager.docBaseUrl`-getter and will thus override any user-specified `docBaseUrl` API-parameter. --- src/core/annotation.js | 5 ++- src/core/catalog.js | 18 +++++++- src/core/pdf_manager.js | 10 ++++- test/pdfs/.gitignore | 1 + test/pdfs/issue14802.pdf | 89 ++++++++++++++++++++++++++++++++++++++++ test/test_manifest.json | 8 ++++ 6 files changed, 126 insertions(+), 5 deletions(-) create mode 100644 test/pdfs/issue14802.pdf diff --git a/src/core/annotation.js b/src/core/annotation.js index 056af502f..195a9cf1e 100644 --- a/src/core/annotation.js +++ b/src/core/annotation.js @@ -72,9 +72,12 @@ class AnnotationFactory { static create(xref, ref, pdfManager, idFactory, collectFields) { return Promise.all([ pdfManager.ensureCatalog("acroForm"), + // Only necessary to prevent the `pdfManager.docBaseUrl`-getter, used + // with certain Annotations, from throwing and thus breaking parsing: + pdfManager.ensureCatalog("baseUrl"), pdfManager.ensureDoc("xfaDatasets"), collectFields ? this._getPageIndex(xref, ref, pdfManager) : -1, - ]).then(([acroForm, xfaDatasets, pageIndex]) => + ]).then(([acroForm, baseUrl, xfaDatasets, pageIndex]) => pdfManager.ensure(this, "_create", [ xref, ref, diff --git a/src/core/catalog.js b/src/core/catalog.js index dcc509c91..3809d1584 100644 --- a/src/core/catalog.js +++ b/src/core/catalog.js @@ -1387,6 +1387,22 @@ class Catalog { return next(pageRef); } + get baseUrl() { + const uri = this._catDict.get("URI"); + if (uri instanceof Dict) { + const base = uri.get("Base"); + if (typeof base === "string") { + const absoluteUrl = createValidAbsoluteUrl(base, null, { + tryConvertEncoding: true, + }); + if (absoluteUrl) { + return shadow(this, "baseUrl", absoluteUrl.href); + } + } + } + return shadow(this, "baseUrl", null); + } + /** * @typedef {Object} ParseDestDictionaryParameters * @property {Dict} destDict - The dictionary containing the destination. @@ -1464,8 +1480,6 @@ class Catalog { // Some bad PDFs do not put parentheses around relative URLs. url = "/" + url.name; } - // TODO: pdf spec mentions urls can be relative to a Base - // entry in the dictionary. break; case "GoTo": diff --git a/src/core/pdf_manager.js b/src/core/pdf_manager.js index d9e444992..d89f643a4 100644 --- a/src/core/pdf_manager.js +++ b/src/core/pdf_manager.js @@ -13,7 +13,12 @@ * limitations under the License. */ -import { createValidAbsoluteUrl, unreachable, warn } from "../shared/util.js"; +import { + createValidAbsoluteUrl, + shadow, + unreachable, + warn, +} from "../shared/util.js"; import { ChunkedStreamManager } from "./chunked_stream.js"; import { MissingDataException } from "./core_utils.js"; import { PDFDocument } from "./document.js"; @@ -46,7 +51,8 @@ class BasePdfManager { } get docBaseUrl() { - return this._docBaseUrl; + const catalog = this.pdfDocument.catalog; + return shadow(this, "docBaseUrl", catalog.baseUrl || this._docBaseUrl); } onLoadedStream() { diff --git a/test/pdfs/.gitignore b/test/pdfs/.gitignore index 71d3bbb2b..2306c1674 100644 --- a/test/pdfs/.gitignore +++ b/test/pdfs/.gitignore @@ -238,6 +238,7 @@ !pdfjsbad1586.pdf !standard_fonts.pdf !freeculture.pdf +!issue14802.pdf !issue6006.pdf !pdfkit_compressed.pdf !TAMReview.pdf diff --git a/test/pdfs/issue14802.pdf b/test/pdfs/issue14802.pdf new file mode 100644 index 000000000..b939b65a3 --- /dev/null +++ b/test/pdfs/issue14802.pdf @@ -0,0 +1,89 @@ +%PDF-1.7 +%âãÏÓ +1 0 obj +<< +/Pages 2 0 R +/URI +<< +/Base (http://example.com/) +>> +/Type /Catalog +>> +endobj +2 0 obj +<< +/Kids [3 0 R] +/Type /Pages +/Count 1 +>> +endobj +3 0 obj +<< +/Parent 2 0 R +/Annots [4 0 R] +/Resources +<< +/Font +<< +/F1 5 0 R +>> +>> +/MediaBox [0 0 260 50] +/Type /Page +/Contents 6 0 R +>> +endobj +4 0 obj +<< +/Border [0 0 1] +/Subtype /Link +/C [0 0 1] +/A +<< +/URI (./relative_link.txt) +/Type /Action +/S /URI +>> +/Type /Annot +/Rect [5 10 250 40] +>> +endobj +5 0 obj +<< +/BaseFont /Times-Roman +/Subtype /Type1 +/Type /Font +/Encoding /WinAnsiEncoding +>> +endobj +6 0 obj +<< +/Length 81 +>> +stream +1 0 0 rg +BT +10 20 TD +/F1 14 Tf +(A relative link, with a /Catalog Base-URI) Tj +ET + +endstream +endobj xref +0 7 +0000000000 65535 f +0000000015 00000 n +0000000106 00000 n +0000000165 00000 n +0000000310 00000 n +0000000467 00000 n +0000000568 00000 n +trailer + +<< +/Root 1 0 R +/Size 7 +>> +startxref +701 +%%EOF diff --git a/test/test_manifest.json b/test/test_manifest.json index 298b28396..9465e2712 100644 --- a/test/test_manifest.json +++ b/test/test_manifest.json @@ -2950,6 +2950,14 @@ "link": true, "type": "eq" }, + { "id": "issue14802", + "file": "pdfs/issue14802.pdf", + "md5": "c1e774945fee539c7fcfec00b36dd4e6", + "rounds": 1, + "type": "eq", + "annotations": true, + "about": "LinkAnnotation with a relative link, and a /Catalog Base-URI." + }, { "id": "issue1127-text", "file": "pdfs/issue1127.pdf", "md5": "4fb2be5ffefeafda4ba977de2a1bb4d8",