f73c9b75d9
Moreover, adjust one linked test case that did not conform to the standard Web Archive URL format and adjust one linked test case because the link was dead.
188 lines
5.4 KiB
JavaScript
188 lines
5.4 KiB
JavaScript
/*
|
|
* Copyright 2014 Mozilla Foundation
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
* you may not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*/
|
|
/* eslint-disable object-shorthand */
|
|
|
|
'use strict';
|
|
|
|
var fs = require('fs');
|
|
var crypto = require('crypto');
|
|
var http = require('http');
|
|
var https = require('https');
|
|
|
|
function rewriteWebArchiveUrl(url) {
|
|
// Web Archive URLs need to be transformed to add `if_` after the ID.
|
|
// Without this, an HTML page containing an iframe with the PDF file
|
|
// will be served instead (issue 8920).
|
|
var webArchiveRegex =
|
|
/(^https?:\/\/web\.archive\.org\/web\/)(\d+)(\/https?:\/\/.+)/g;
|
|
var urlParts = webArchiveRegex.exec(url);
|
|
if (urlParts) {
|
|
return urlParts[1] + (urlParts[2] + 'if_') + urlParts[3];
|
|
}
|
|
return url;
|
|
}
|
|
|
|
function downloadFile(file, url, callback, redirects) {
|
|
url = rewriteWebArchiveUrl(url);
|
|
|
|
var completed = false;
|
|
var protocol = /^https:\/\//.test(url) ? https : http;
|
|
protocol.get(url, function (response) {
|
|
var redirectTo;
|
|
if (response.statusCode === 301 || response.statusCode === 302 ||
|
|
response.statusCode === 307 || response.statusCode === 308) {
|
|
if (redirects > 10) {
|
|
callback('Too many redirects');
|
|
}
|
|
redirectTo = response.headers.location;
|
|
redirectTo = require('url').resolve(url, redirectTo);
|
|
downloadFile(file, redirectTo, callback, (redirects || 0) + 1);
|
|
return;
|
|
}
|
|
if (response.statusCode === 404 && url.indexOf('web.archive.org') < 0) {
|
|
// trying waybackmachine
|
|
redirectTo = 'http://web.archive.org/web/' + url;
|
|
downloadFile(file, redirectTo, callback, (redirects || 0) + 1);
|
|
return;
|
|
}
|
|
|
|
if (response.statusCode !== 200) {
|
|
if (!completed) {
|
|
completed = true;
|
|
callback('HTTP ' + response.statusCode);
|
|
}
|
|
return;
|
|
}
|
|
var stream = fs.createWriteStream(file);
|
|
stream.on('error', function (err) {
|
|
if (!completed) {
|
|
completed = true;
|
|
callback(err);
|
|
}
|
|
});
|
|
response.pipe(stream);
|
|
stream.on('finish', function() {
|
|
stream.close();
|
|
if (!completed) {
|
|
completed = true;
|
|
callback();
|
|
}
|
|
});
|
|
}).on('error', function (err) {
|
|
if (!completed) {
|
|
if (typeof err === 'object' && err.errno === 'ENOTFOUND' &&
|
|
url.indexOf('web.archive.org') < 0) {
|
|
// trying waybackmachine
|
|
var redirectTo = 'http://web.archive.org/web/' + url;
|
|
downloadFile(file, redirectTo, callback, (redirects || 0) + 1);
|
|
return;
|
|
}
|
|
completed = true;
|
|
callback(err);
|
|
}
|
|
});
|
|
}
|
|
|
|
function downloadManifestFiles(manifest, callback) {
|
|
function downloadNext() {
|
|
if (i >= links.length) {
|
|
callback();
|
|
return;
|
|
}
|
|
var file = links[i].file;
|
|
var url = links[i].url;
|
|
console.log('Downloading ' + url + ' to ' + file + '...');
|
|
downloadFile(file, url, function (err) {
|
|
if (err) {
|
|
console.error('Error during downloading of ' + url + ': ' + err);
|
|
fs.writeFileSync(file, ''); // making it empty file
|
|
fs.writeFileSync(file + '.error', err);
|
|
}
|
|
i++;
|
|
downloadNext();
|
|
});
|
|
}
|
|
|
|
var links = manifest.filter(function (item) {
|
|
return item.link && !fs.existsSync(item.file);
|
|
}).map(function (item) {
|
|
var file = item.file;
|
|
var linkfile = file + '.link';
|
|
var url = fs.readFileSync(linkfile).toString();
|
|
url = url.replace(/\s+$/, '');
|
|
return { file: file, url: url, };
|
|
});
|
|
|
|
var i = 0;
|
|
downloadNext();
|
|
}
|
|
|
|
function calculateMD5(file, callback) {
|
|
var hash = crypto.createHash('md5');
|
|
var stream = fs.createReadStream(file);
|
|
stream.on('data', function (data) {
|
|
hash.update(data);
|
|
});
|
|
stream.on('error', function (err) {
|
|
callback(err);
|
|
});
|
|
stream.on('end', function() {
|
|
var result = hash.digest('hex');
|
|
callback(null, result);
|
|
});
|
|
}
|
|
|
|
function verifyManifestFiles(manifest, callback) {
|
|
function verifyNext() {
|
|
if (i >= manifest.length) {
|
|
callback(error);
|
|
return;
|
|
}
|
|
var item = manifest[i];
|
|
if (fs.existsSync(item.file + '.error')) {
|
|
console.error('WARNING: File was not downloaded. See "' +
|
|
item.file + '.error" file.');
|
|
error = true;
|
|
i++;
|
|
verifyNext();
|
|
return;
|
|
}
|
|
calculateMD5(item.file, function (err, md5) {
|
|
if (err) {
|
|
console.log('WARNING: Unable to open file for reading "' + err + '".');
|
|
error = true;
|
|
} else if (!item.md5) {
|
|
console.error('WARNING: Missing md5 for file "' + item.file + '". ' +
|
|
'Hash for current file is "' + md5 + '"');
|
|
error = true;
|
|
} else if (md5 !== item.md5) {
|
|
console.error('WARNING: MD5 of file "' + item.file +
|
|
'" does not match file. Expected "' +
|
|
item.md5 + '" computed "' + md5 + '"');
|
|
error = true;
|
|
}
|
|
i++;
|
|
verifyNext();
|
|
});
|
|
}
|
|
var i = 0;
|
|
var error = false;
|
|
verifyNext();
|
|
}
|
|
|
|
exports.downloadManifestFiles = downloadManifestFiles;
|
|
exports.verifyManifestFiles = verifyManifestFiles;
|