From f73c9b75d9a540a913c0c42b5b11c009bf7418f4 Mon Sep 17 00:00:00 2001 From: Tim van der Meij Date: Sat, 30 Sep 2017 17:14:41 +0200 Subject: [PATCH] Transform Web Archive URLs to avoid downloading an HTML page instead of the PDF file Moreover, adjust one linked test case that did not conform to the standard Web Archive URL format and adjust one linked test case because the link was dead. --- test/downloadutils.js | 15 +++++++++++++++ test/pdfs/geothermal.pdf.link | 2 +- test/pdfs/issue8169.pdf.link | 2 +- 3 files changed, 17 insertions(+), 2 deletions(-) diff --git a/test/downloadutils.js b/test/downloadutils.js index ef1454161..df4c664fa 100644 --- a/test/downloadutils.js +++ b/test/downloadutils.js @@ -22,7 +22,22 @@ var crypto = require('crypto'); var http = require('http'); var https = require('https'); +function rewriteWebArchiveUrl(url) { + // Web Archive URLs need to be transformed to add `if_` after the ID. + // Without this, an HTML page containing an iframe with the PDF file + // will be served instead (issue 8920). + var webArchiveRegex = + /(^https?:\/\/web\.archive\.org\/web\/)(\d+)(\/https?:\/\/.+)/g; + var urlParts = webArchiveRegex.exec(url); + if (urlParts) { + return urlParts[1] + (urlParts[2] + 'if_') + urlParts[3]; + } + return url; +} + function downloadFile(file, url, callback, redirects) { + url = rewriteWebArchiveUrl(url); + var completed = false; var protocol = /^https:\/\//.test(url) ? https : http; protocol.get(url, function (response) { diff --git a/test/pdfs/geothermal.pdf.link b/test/pdfs/geothermal.pdf.link index 85cafd282..a1f2f5ef7 100644 --- a/test/pdfs/geothermal.pdf.link +++ b/test/pdfs/geothermal.pdf.link @@ -1 +1 @@ -http://web.archive.org/web/20150212141833/http://geothermal.inel.gov/publications/future_of_geothermal_energy.pdf +https://web.archive.org/web/20170930174755/https://www.pdf-archive.com/2017/09/30/future-of-geothermal-energy/future-of-geothermal-energy.pdf diff --git a/test/pdfs/issue8169.pdf.link b/test/pdfs/issue8169.pdf.link index da564680c..7b0a34230 100644 --- a/test/pdfs/issue8169.pdf.link +++ b/test/pdfs/issue8169.pdf.link @@ -1 +1 @@ -http://web.archive.org/save/_embed/http://210.243.166.143/prob1.pdf +https://web.archive.org/web/20170930161657/http://210.243.166.143/prob1.pdf