Transform Web Archive URLs to avoid downloading an HTML page instead of the PDF file
Additionally, update two linked test cases: one whose URL did not conform to the standard Web Archive URL format, and one whose link was dead.
This commit is contained in:
parent
3717757b39
commit
f73c9b75d9
@ -22,7 +22,22 @@ var crypto = require('crypto');
|
||||
var http = require('http');
|
||||
var https = require('https');
|
||||
|
||||
/**
 * Insert the `if_` modifier after the numeric ID of a Web Archive URL.
 *
 * Without the modifier, an HTML page containing an iframe with the PDF file
 * is served instead of the PDF file itself (issue 8920).
 *
 * @param {string} url - The URL that is about to be downloaded.
 * @returns {string} The URL with `if_` appended to the ID when it has the
 *   form `http(s)://web.archive.org/web/<digits>/http(s)://...`; otherwise
 *   the `url` argument, returned unchanged.
 */
function rewriteWebArchiveUrl(url) {
  // Capture groups: (1) the archive prefix up to and including `/web/`,
  // (2) the numeric ID, (3) the slash plus the archived target URL.
  var match =
    /(^https?:\/\/web\.archive\.org\/web\/)(\d+)(\/https?:\/\/.+)/g.exec(url);

  // Anything that is not a plain Web Archive URL is passed through as-is.
  // This includes URLs that already carry `if_`, since the ID must be
  // followed directly by a slash for the pattern to match.
  if (!match) {
    return url;
  }

  var archivePrefix = match[1];
  var archiveId = match[2];
  var archivedTarget = match[3];
  return archivePrefix + archiveId + 'if_' + archivedTarget;
}
|
||||
|
||||
function downloadFile(file, url, callback, redirects) {
|
||||
url = rewriteWebArchiveUrl(url);
|
||||
|
||||
var completed = false;
|
||||
var protocol = /^https:\/\//.test(url) ? https : http;
|
||||
protocol.get(url, function (response) {
|
||||
|
@ -1 +1 @@
|
||||
http://web.archive.org/web/20150212141833/http://geothermal.inel.gov/publications/future_of_geothermal_energy.pdf
|
||||
https://web.archive.org/web/20170930174755/https://www.pdf-archive.com/2017/09/30/future-of-geothermal-energy/future-of-geothermal-energy.pdf
|
||||
|
@ -1 +1 @@
|
||||
http://web.archive.org/save/_embed/http://210.243.166.143/prob1.pdf
|
||||
https://web.archive.org/web/20170930161657/http://210.243.166.143/prob1.pdf
|
||||
|
Loading…
Reference in New Issue
Block a user