d4309614f9
The `DOMParser` is most likely overkill and may be less secure. Moreover, it is not supported in Node.js environments. This patch replaces the `DOMParser` with a simple XML parser. This should be faster and gives us Node.js support for free. The simple XML parser is a port of the one that existed in the examples folder with a small regex fix to make the parsing work correctly. The unit tests are extended for increased test coverage of the metadata code. The new method `getAll` is provided so the example does not have to access internal properties of the object anymore.
69 lines
2.3 KiB
JavaScript
69 lines
2.3 KiB
JavaScript
/* Any copyright is dedicated to the Public Domain.
|
|
* http://creativecommons.org/publicdomain/zero/1.0/ */
|
|
|
|
//
|
|
// Basic node example that prints document metadata and text content.
|
|
// Requires single file built version of PDF.js -- please run
|
|
// `gulp singlefile` before running the example.
|
|
//
|
|
|
|
var fs = require('fs');
|
|
|
|
// Run `gulp dist-install` to generate 'pdfjs-dist' npm package files.
|
|
var pdfjsLib = require('pdfjs-dist');
|
|
|
|
// The PDF to inspect is named by the first command-line argument; when none is
// given the relative path below is used. The path is handed to `getDocument`
// unchanged.
var pdfPath = process.argv[2] || '../../web/compressed.tracemonkey-pldi-09.pdf';

// Everything below is promise-driven rather than callback-driven: the document
// summary is printed first, then the metadata report, then one report per
// page, each step starting only after the previous one has settled.
pdfjsLib.getDocument(pdfPath).then(function onDocumentLoaded(doc) {
  var pageCount = doc.numPages;
  console.log('# Document Loaded');
  console.log('Number of Pages: ' + pageCount);
  console.log();

  // Prints `data.info` as indented JSON and, when `data.metadata` is truthy,
  // the result of its `getAll()` call as well.
  function reportMetadata(data) {
    console.log('# Metadata Is Loaded');
    console.log('## Info');
    console.log(JSON.stringify(data.info, null, 2));
    console.log();
    if (!data.metadata) {
      return;
    }
    console.log('## Metadata');
    console.log(JSON.stringify(data.metadata.getAll(), null, 2));
    console.log();
  }

  // Prints the text of one page as a single space-separated line.
  function reportText(content) {
    // Only the `str` field of each item is used; every other field of the
    // text content is ignored here.
    var pieces = content.items.map(function (item) {
      return item.str;
    });
    console.log('## Text Content');
    console.log(pieces.join(' '));
  }

  // Emits the blank line that closes a page report.
  function endPageReport() {
    console.log();
  }

  // Fetches page `pageNum`, prints its heading and viewport size, then its
  // text. Returns a promise that settles once the whole report is printed.
  function reportPage(pageNum) {
    return doc.getPage(pageNum).then(function (page) {
      console.log('# Page ' + pageNum);
      var viewport = page.getViewport(1.0 /* scale */);
      console.log('Size: ' + viewport.width + 'x' + viewport.height);
      console.log();
      return page.getTextContent().then(reportText).then(endPageReport);
    });
  }

  // Page numbers are 1-based and run up to and including `pageCount`.
  var pageNumbers = [];
  for (var n = 1; n <= pageCount; n++) {
    pageNumbers.push(n);
  }

  // Start from the metadata report and append one step per page, so the first
  // page waits on the metadata and every later page waits on the page before
  // it.
  var metadataReported = doc.getMetadata().then(reportMetadata);
  return pageNumbers.reduce(function (previousStep, pageNum) {
    return previousStep.then(function () {
      return reportPage(pageNum);
    });
  }, metadataReported);
}).then(function onAllReported() {
  console.log('# End of Document');
}, function onFailure(err) {
  // Receives a rejection from loading the document or from any step of the
  // reporting chain above.
  console.error('Error: ' + err);
});
|