Merge pull request #8979 from timvandermeij/downloads

Transform Web Archive URLs to avoid downloading an HTML page instead of the PDF file
This commit is contained in:
Jonas Jenwald 2017-09-30 22:47:23 +02:00 committed by GitHub
commit f9ce904fb7
3 changed files with 17 additions and 2 deletions

View File

@ -22,7 +22,22 @@ var crypto = require('crypto');
var http = require('http');
var https = require('https');
/**
 * Rewrites a Web Archive snapshot URL so that it carries the `if_` marker
 * directly after the numeric snapshot ID.
 *
 * Without the marker, the Web Archive serves an HTML page that embeds the
 * PDF file in an iframe rather than the PDF file itself (issue 8920).
 *
 * @param {string} url - The URL that is about to be downloaded.
 * @returns {string} The URL with `if_` inserted after the snapshot ID when
 *   it has the `http(s)://web.archive.org/web/<digits>/http(s)://...` shape;
 *   otherwise the given value, untouched.
 */
function rewriteWebArchiveUrl(url) {
  // The literal is evaluated anew on every call, so matching always starts
  // at index 0 even though the pattern carries the global flag.
  var match =
    /(^https?:\/\/web\.archive\.org\/web\/)(\d+)(\/https?:\/\/.+)/g.exec(url);
  if (!match) {
    // Not a Web Archive snapshot URL (or already rewritten): leave it alone.
    return url;
  }

  var archivePrefix = match[1];
  var snapshotId = match[2];
  var archivedTarget = match[3];
  return archivePrefix + snapshotId + 'if_' + archivedTarget;
}
function downloadFile(file, url, callback, redirects) {
url = rewriteWebArchiveUrl(url);
var completed = false;
var protocol = /^https:\/\//.test(url) ? https : http;
protocol.get(url, function (response) {

View File

@ -1 +1 @@
http://web.archive.org/web/20150212141833/http://geothermal.inel.gov/publications/future_of_geothermal_energy.pdf
https://web.archive.org/web/20170930174755/https://www.pdf-archive.com/2017/09/30/future-of-geothermal-energy/future-of-geothermal-energy.pdf

View File

@ -1 +1 @@
http://web.archive.org/save/_embed/http://210.243.166.143/prob1.pdf
https://web.archive.org/web/20170930161657/http://210.243.166.143/prob1.pdf