pdf2svg.js: Serialize the SVG to a stream
Implement a serialization "generator" for `DOMElement` in domstubs.js that yields the serialization of the SVG element in chunks. This method is used by a newly added `ReadableSVGStream` class, which can be used like any other readable stream in Node.js. This reduces the memory requirements: the serialization no longer needs to fit fully in memory. Note: The serializer is implemented as a state machine in ES5 because the rest of the file is also in ES5. Its functionality is equivalent to: ``` function* serializeSVGElement(elem) { yield '<' + elem.nodeName; if (elem.nodeName === 'svg:svg') { yield ' xmlns:xlink="http://www.w3.org/1999/xlink"' + ' xmlns:svg="http://www.w3.org/2000/svg"'; } for (let i in elem.attributes) { yield ' ' + i + '="' + xmlEncode(elem.attributes[i]) + '"'; } yield '>'; if (elem.nodeName === 'svg:tspan' || elem.nodeName === 'svg:style') { yield xmlEncode(elem.textContent); } else { for (let childNode of elem.childNodes) { yield* serializeSVGElement(childNode); } } yield '</' + elem.nodeName + '>'; } ```
This commit is contained in:
parent
ba5dbc9632
commit
9b5086d649
@ -91,30 +91,6 @@ DOMElement.prototype = {
|
||||
}
|
||||
},
|
||||
|
||||
// Serializes this element, including its attributes and (recursively) its
// child nodes, into one string. All pieces are collected in an array and
// joined at the end, so the whole result is held in memory at once.
toString: function DOMElement_toString() {
|
||||
var buf = [];
|
||||
buf.push('<' + this.nodeName);
|
||||
// Only the svg:svg element gets the namespace declarations.
if (this.nodeName === 'svg:svg') {
|
||||
buf.push(' xmlns:xlink="http://www.w3.org/1999/xlink"' +
|
||||
' xmlns:svg="http://www.w3.org/2000/svg"');
|
||||
}
|
||||
// Attribute values are passed through xmlEncode; names are emitted as-is.
for (var i in this.attributes) {
|
||||
buf.push(' ' + i + '="' + xmlEncode(this.attributes[i]) + '"');
|
||||
}
|
||||
|
|
||||
buf.push('>');
|
||||
|
|
||||
// svg:tspan and svg:style emit their (encoded) textContent; every other
// element emits the serialization of its child nodes instead.
if (this.nodeName === 'svg:tspan' || this.nodeName === 'svg:style') {
|
||||
buf.push(xmlEncode(this.textContent));
|
||||
} else {
|
||||
this.childNodes.forEach(function(childNode) {
|
||||
buf.push(childNode.toString());
|
||||
});
|
||||
}
|
||||
buf.push('</' + this.nodeName + '>');
|
||||
return buf.join('');
|
||||
},
|
||||
|
||||
cloneNode: function DOMElement_cloneNode() {
|
||||
var newNode = new DOMElement(this.nodeName);
|
||||
newNode.childNodes = this.childNodes;
|
||||
@ -122,8 +98,95 @@ DOMElement.prototype = {
|
||||
newNode.textContent = this.textContent;
|
||||
return newNode;
|
||||
},
|
||||
|
||||
// This method is offered for convenience. It is recommended to directly use
|
||||
// getSerializer because that allows you to process the chunks as they come
|
||||
// instead of requiring the whole image to fit in memory.
|
||||
toString: function DOMElement_toString() {
|
||||
// Collects every chunk before joining, so the full serialization is held
// in memory here.
var buf = [];
|
||||
var serializer = this.getSerializer();
|
||||
var chunk;
|
||||
// getNext() returns null once the element has been fully serialized.
while ((chunk = serializer.getNext()) !== null) {
|
||||
buf.push(chunk);
|
||||
}
|
||||
return buf.join('');
|
||||
},
|
||||
|
||||
// Returns a new DOMElementSerializer for this element. Call getNext() on the
// result repeatedly to obtain the serialization chunk by chunk; each call to
// getSerializer creates an independent serializer that starts from scratch.
getSerializer: function DOMElement_getSerializer() {
|
||||
return new DOMElementSerializer(this);
|
||||
}
|
||||
}
|
||||
|
||||
// Incremental serializer for a single element. All progress is kept in the
// fields below so that getNext() can resume where the previous call stopped.
function DOMElementSerializer(node) {
|
||||
// The element being serialized.
this._node = node;
|
||||
// Current step of the state machine; matches the case labels in getNext().
this._state = 0;
|
||||
// Loop counter, used first for the attribute keys and later for childNodes.
this._loopIndex = 0;
|
||||
// Attribute names of the node; filled in by getNext() before it is read.
this._attributeKeys = null;
|
||||
// Serializer of the child node that is currently being emitted, if any.
this._childSerializer = null;
|
||||
}
|
||||
DOMElementSerializer.prototype = {
|
||||
/**
|
||||
* Yields the next chunk in the serialization of the element.
|
||||
*
|
||||
* @returns {string|null} null if the element has fully been serialized.
|
||||
*/
|
||||
getNext: function DOMElementSerializer_getNext() {
|
||||
var node = this._node;
|
||||
// State machine: each call resumes at this._state. Several cases end without
// a return on purpose, so that execution falls through into the next state
// within the same call whenever the current state has nothing to emit.
switch (this._state) {
|
||||
case 0: // Start opening tag.
|
||||
++this._state;
|
||||
return '<' + node.nodeName;
|
||||
case 1: // Add SVG namespace if this is the root element.
|
||||
++this._state;
|
||||
if (node.nodeName === 'svg:svg') {
|
||||
return ' xmlns:xlink="http://www.w3.org/1999/xlink"' +
|
||||
' xmlns:svg="http://www.w3.org/2000/svg"';
|
||||
}
// Not svg:svg: nothing to emit, falls through to state 2.
|
||||
case 2: // Initialize variables for looping over attributes.
|
||||
++this._state;
|
||||
this._loopIndex = 0;
|
||||
this._attributeKeys = Object.keys(node.attributes);
// falls through
|
||||
case 3: // Serialize any attributes and end opening tag.
|
||||
// One attribute per call; the state only advances after the last attribute,
// at which point the opening tag is closed.
if (this._loopIndex < this._attributeKeys.length) {
|
||||
var name = this._attributeKeys[this._loopIndex++];
|
||||
return ' ' + name + '="' + xmlEncode(node.attributes[name]) + '"';
|
||||
}
|
||||
++this._state;
|
||||
return '>';
|
||||
case 4: // Serialize textContent for tspan/style elements.
|
||||
if (node.nodeName === 'svg:tspan' || node.nodeName === 'svg:style') {
|
||||
// Jump straight to the ending tag: these elements emit their textContent
// and skip the child-node loop of state 5.
this._state = 6;
|
||||
return xmlEncode(node.textContent);
|
||||
}
|
||||
++this._state;
|
||||
// Reuse the loop counter, this time for iterating over childNodes.
this._loopIndex = 0;
// falls through
|
||||
case 5: // Serialize child nodes (only for non-tspan/style elements).
|
||||
var value;
|
||||
while (true) {
|
||||
// null when there is no active child serializer, or when the active one
// has nothing left to emit.
value = this._childSerializer && this._childSerializer.getNext();
|
||||
if (value !== null) {
|
||||
return value;
|
||||
}
|
||||
// Move on to the next child; a falsy entry (e.g. reading past the end of
// childNodes) ends the loop.
var nextChild = node.childNodes[this._loopIndex++];
|
||||
if (nextChild) {
|
||||
this._childSerializer = new DOMElementSerializer(nextChild);
|
||||
} else {
|
||||
this._childSerializer = null;
|
||||
++this._state;
|
||||
// Leaves the while loop only; execution then falls through to state 6.
break;
|
||||
}
|
||||
}
|
||||
case 6: // Ending tag.
|
||||
++this._state;
|
||||
return '</' + node.nodeName + '>';
|
||||
case 7: // Done.
|
||||
// The state is not advanced any further, so every later call returns null.
return null;
|
||||
default:
|
||||
throw new Error('Unexpected serialization state: ' + this._state);
|
||||
}
|
||||
},
|
||||
};
|
||||
|
||||
const document = {
|
||||
childNodes : [],
|
||||
|
||||
|
@ -6,6 +6,9 @@
|
||||
//
|
||||
|
||||
var fs = require('fs');
|
||||
var util = require('util');
|
||||
var path = require('path');
|
||||
var stream = require('stream');
|
||||
|
||||
// HACK few hacks to let PDF.js be loaded not as a module in global space.
|
||||
require('./domstubs.js').setStubs(global);
|
||||
@ -17,32 +20,66 @@ var pdfjsLib = require('pdfjs-dist');
|
||||
var pdfPath = process.argv[2] || '../../web/compressed.tracemonkey-pldi-09.pdf';
|
||||
var data = new Uint8Array(fs.readFileSync(pdfPath));
|
||||
|
||||
// Dumps svg outputs to a folder called svgdump
|
||||
function writeToFile(svgdump, pageNum, callback) {
|
||||
var name = getFileNameFromPath(pdfPath);
|
||||
fs.mkdir('./svgdump/', function(err) {
|
||||
if (!err || err.code === 'EEXIST') {
|
||||
fs.writeFile('./svgdump/' + name + "-" + pageNum + '.svg', svgdump,
|
||||
function(err) {
|
||||
if (err) {
|
||||
console.log('Error: ' + err);
|
||||
} else {
|
||||
console.log('Page: ' + pageNum);
|
||||
}
|
||||
callback();
|
||||
});
|
||||
} else {
|
||||
callback();
|
||||
}
|
||||
});
|
||||
// Directory into which the generated SVG files are written.
var outputDirectory = './svgdump';
|
||||
|
|
||||
try {
|
||||
// Note: This creates a directory only one level deep. If you want to create
|
||||
// multiple subdirectories on the fly, use the mkdirp module from npm.
|
||||
fs.mkdirSync(outputDirectory);
|
||||
} catch (e) {
|
||||
// An already existing directory is fine; any other failure is rethrown.
if (e.code !== 'EEXIST') {
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
|
||||
// Get filename from the path
|
||||
// Dumps svg outputs to a folder called svgdump
|
||||
/**
 * Builds the output path for one page:
 * <outputDirectory>/<base name of pdfPath without its extension>-<pageNum>.svg
 *
 * @param {number} pageNum Page number used as the file name suffix.
 * @returns {string} Path of the SVG file for that page.
 */
function getFilePathForPage(pageNum) {
|
||||
// Passing the extension to basename strips it from the file name.
var name = path.basename(pdfPath, path.extname(pdfPath));
|
||||
return path.join(outputDirectory, name + '-' + pageNum + '.svg');
|
||||
}
|
||||
|
||||
function getFileNameFromPath(path) {
|
||||
var index = path.lastIndexOf('/');
|
||||
var extIndex = path.lastIndexOf('.');
|
||||
return path.substring(index, extIndex);
|
||||
/**
|
||||
* A readable stream which offers a stream representing the serialization of a
|
||||
* given DOM element (as defined by domstubs.js).
|
||||
*
|
||||
* @param {object} options
|
||||
* @param {DOMElement} options.svgElement The element to serialize
|
||||
*/
|
||||
function ReadableSVGStream(options) {
|
||||
// Allow the constructor to be called without `new`.
if (!(this instanceof ReadableSVGStream)) {
|
||||
return new ReadableSVGStream(options);
|
||||
}
|
||||
// The same options object is forwarded to the base Readable constructor.
stream.Readable.call(this, options);
|
||||
// Source of the chunks that _read pushes into the stream.
this.serializer = options.svgElement.getSerializer();
|
||||
}
|
||||
// ES5-style inheritance: ReadableSVGStream extends stream.Readable.
util.inherits(ReadableSVGStream, stream.Readable);
|
||||
// Implements https://nodejs.org/api/stream.html#stream_readable_read_size_1
|
||||
ReadableSVGStream.prototype._read = function() {
|
||||
var chunk;
|
||||
// Push chunks until the serializer is exhausted (getNext() returns null).
while ((chunk = this.serializer.getNext()) !== null) {
|
||||
// Per the Node.js stream API, push() returns false when no further chunks
// should be pushed for now; stop here and continue on the next _read call.
// The serializer keeps its own position, so no chunk is lost or repeated.
if (!this.push(chunk)) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
// Signal the end of the stream.
this.push(null);
|
||||
};
|
||||
|
||||
// Streams the SVG element to the given file path.
|
||||
/**
 * Pipes the serialization of svgElement into a file at filePath.
 *
 * @param {DOMElement} svgElement The element to serialize.
 * @param {string} filePath Destination passed to fs.createWriteStream.
 * @returns {Promise} Resolves when the write stream emits 'finish'; rejects
 *   with the first error emitted by either stream.
 */
function writeSvgToFile(svgElement, filePath) {
|
||||
var readableSvgStream = new ReadableSVGStream({
|
||||
svgElement: svgElement,
|
||||
});
|
||||
var writableStream = fs.createWriteStream(filePath);
|
||||
return new Promise(function(resolve, reject) {
|
||||
// Listeners are attached before pipe() starts the data flow.
readableSvgStream.once('error', reject);
|
||||
writableStream.once('error', reject);
|
||||
writableStream.once('finish', resolve);
|
||||
readableSvgStream.pipe(writableStream);
|
||||
}).catch(function(err) {
|
||||
readableSvgStream = null; // Explicitly null because of v8 bug 6512.
|
||||
// End the write stream on failure, then pass the error on to the caller.
writableStream.end();
|
||||
throw err;
|
||||
});
|
||||
}
|
||||
|
||||
// Will be using promises to load document, pages and misc data instead of
|
||||
@ -69,13 +106,14 @@ pdfjsLib.getDocument({
|
||||
var svgGfx = new pdfjsLib.SVGGraphics(page.commonObjs, page.objs);
|
||||
svgGfx.embedFonts = true;
|
||||
return svgGfx.getSVG(opList, viewport).then(function (svg) {
|
||||
var svgDump = svg.toString();
|
||||
return new Promise(function(resolve) {
|
||||
writeToFile(svgDump, pageNum, resolve);
|
||||
return writeSvgToFile(svg, getFilePathForPage(pageNum)).then(function () {
|
||||
console.log('Page: ' + pageNum);
|
||||
}, function(err) {
|
||||
console.log('Error: ' + err);
|
||||
});
|
||||
});
|
||||
});
|
||||
})
|
||||
});
|
||||
};
|
||||
|
||||
for (var i = 1; i <= numPages; i++) {
|
||||
|
Loading…
Reference in New Issue
Block a user