Transform Web Archive URLs to avoid downloading an HTML page instead of the PDF file

Moreover, update two linked test cases: one whose URL did not conform
to the standard Web Archive URL format, and one whose link was dead.
This commit is contained in:
Tim van der Meij 2017-09-30 17:14:41 +02:00
parent 3717757b39
commit f73c9b75d9
No known key found for this signature in database
GPG Key ID: 8C3FD2925A5F2762
3 changed files with 17 additions and 2 deletions

View File

@ -22,7 +22,22 @@ var crypto = require('crypto');
var http = require('http');
var https = require('https');
/**
 * Rewrite a Web Archive URL so that `if_` directly follows the numeric ID.
 *
 * Without the `if_` suffix, an HTML page containing an iframe with the PDF
 * file is served instead of the PDF file itself (issue 8920).
 *
 * @param {string} url - The URL that is about to be downloaded.
 * @returns {string} The rewritten URL when `url` has the form
 *   `http(s)://web.archive.org/web/<digits>/http(s)://...`; otherwise `url`
 *   is returned unchanged.
 */
function rewriteWebArchiveUrl(url) {
  // Capture groups: (1) archive prefix, (2) numeric ID, (3) archived target.
  var archivePattern =
    /(^https?:\/\/web\.archive\.org\/web\/)(\d+)(\/https?:\/\/.+)/g;
  var match = archivePattern.exec(url);
  if (!match) {
    // Not a matching Web Archive URL; hand it back untouched.
    return url;
  }
  var archivePrefix = match[1];
  var snapshotId = match[2];
  var archivedTarget = match[3];
  return archivePrefix + snapshotId + 'if_' + archivedTarget;
}
function downloadFile(file, url, callback, redirects) {
url = rewriteWebArchiveUrl(url);
var completed = false;
var protocol = /^https:\/\//.test(url) ? https : http;
protocol.get(url, function (response) {

View File

@ -1 +1 @@
http://web.archive.org/web/20150212141833/http://geothermal.inel.gov/publications/future_of_geothermal_energy.pdf
https://web.archive.org/web/20170930174755/https://www.pdf-archive.com/2017/09/30/future-of-geothermal-energy/future-of-geothermal-energy.pdf

View File

@ -1 +1 @@
http://web.archive.org/save/_embed/http://210.243.166.143/prob1.pdf
https://web.archive.org/web/20170930161657/http://210.243.166.143/prob1.pdf