diff --git a/src/core.js b/src/core.js
index 9f3e6b837..6a90357e8 100644
--- a/src/core.js
+++ b/src/core.js
@@ -200,10 +200,12 @@ var Page = (function PageClosure() {
       if (isArray(content)) {
         // fetching items
         var i, n = content.length;
+        var streams = [];
         for (i = 0; i < n; ++i)
-          content[i] = xref.fetchIfRef(content[i]);
-        content = new StreamsSequenceStream(content);
-      }
+          streams.push(xref.fetchIfRef(content[i]));
+        content = new StreamsSequenceStream(streams);
+      } else if (isStream(content))
+        content.pos = 0;
 
       var pe = this.pe = new PartialEvaluator(
                                 xref, handler, 'p' + this.pageNumber + '_');
@@ -212,6 +214,43 @@ var Page = (function PageClosure() {
                                                 dependency));
     },
 
+    /**
+     * Extracts the text shown by the content stream(s) of this page. The
+     * result is cached in this.textContent, so the page is parsed only once.
+     * @return {string} The text as returned by PartialEvaluator.getTextContent.
+     */
+    extractTextContent: function pageExtractTextContent() {
+      if ('textContent' in this) {
+        // text content was extracted
+        return this.textContent;
+      }
+
+      // The evaluator expects a message handler; a stub that ignores every
+      // call is enough, since the extraction result is returned directly.
+      var handler = {
+        on: function() {},
+        send: function() {}
+      };
+
+      var xref = this.xref;
+      var content = xref.fetchIfRef(this.content);
+      var resources = xref.fetchIfRef(this.resources);
+      if (isArray(content)) {
+        // fetching items
+        var i, n = content.length;
+        var streams = [];
+        for (i = 0; i < n; ++i)
+          streams.push(xref.fetchIfRef(content[i]));
+        content = new StreamsSequenceStream(streams);
+      } else if (isStream(content))
+        content.pos = 0;
+
+      var pe = new PartialEvaluator(
+                     xref, handler, 'p' + this.pageNumber + '_');
+      var text = pe.getTextContent(content, resources);
+      return (this.textContent = text);
+    },
+
     ensureFonts: function pageEnsureFonts(fonts, callback) {
       // Convert the font names to the corresponding font obj.
       for (var i = 0, ii = fonts.length; i < ii; i++) {
@@ -614,6 +653,14 @@ var PDFDoc = (function PDFDocClosure() {
         throw data.error;
       }, this);
 
+      // The worker replies to 'extract_text' with the text of each page;
+      // hand it over to the textExtracted callback, when one is installed.
+      messageHandler.on('text_extracted', function pdfDocTextExtracted(data) {
+        var index = data.index;
+        if (this.textExtracted)
+          this.textExtracted(index);
+      }, this);
+
       setTimeout(function pdfDocFontReadySetTimeout() {
         messageHandler.send('doc', this.data);
         this.workerReadyPromise.resolve(true);
@@ -643,6 +690,16 @@ var PDFDoc = (function PDFDocClosure() {
       return (this.pageCache[n] = page);
     },
 
+    /**
+     * Asks the worker, once it is ready, to extract the text of the document.
+     * The result arrives asynchronously with the 'text_extracted' message.
+     */
+    extractText: function pdfDocExtractText() {
+      this.workerReadyPromise.then(function pdfDocExtractTextThen() {
+        this.messageHandler.send('extract_text');
+      }.bind(this));
+    },
+
     destroy: function pdfDocDestroy() {
       if (this.worker)
         this.worker.terminate();
diff --git a/src/evaluator.js b/src/evaluator.js
index a5ca627c5..588da5084 100644
--- a/src/evaluator.js
+++ b/src/evaluator.js
@@ -144,7 +144,7 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
         fontRef = fontRef || fontRes.get(fontName);
         var font = xref.fetchIfRef(fontRef);
         assertWellFormed(isDict(font));
-        if (!font.translated) {
+        if (!font.loadedName) {
           font.translated = self.translateFont(font, xref, resources,
                                                dependency);
           if (font.translated) {
@@ -464,6 +464,72 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
       };
     },
 
+    /**
+     * Collects the string operands of the Tj and TJ operators of a content
+     * stream. Tf is handled too, so that the selected font gets translated.
+     * @param {Stream} stream The content stream to parse.
+     * @param {Dict} resources The resources of the page (or a reference).
+     * @return {string} The concatenated strings, in content stream order.
+     */
+    getTextContent: function partialEvaluatorGetTextContent(stream, resources) {
+      var self = this;
+      var xref = this.xref;
+
+      function handleSetFont(fontName, fontRef) {
+        var fontRes = resources.get('Font');
+
+        // TODO: Can this be reached without font resources? If so, what
+        // would args[0].name look like?
+        assert(fontRes, 'fontRes not available');
+
+        fontRes = xref.fetchIfRef(fontRes);
+        fontRef = fontRef || fontRes.get(fontName);
+        var font = xref.fetchIfRef(fontRef);
+        assertWellFormed(isDict(font));
+        if (!font.translated) {
+          font.translated = self.translateFont(font, xref, resources);
+        }
+        return font;
+      }
+
+      resources = xref.fetchIfRef(resources) || new Dict();
+
+      var parser = new Parser(new Lexer(stream), false);
+      var args = [], obj;
+
+      var text = '';
+      // The strings are appended as is; font is not used to decode them.
+      var font = null;
+      while (!isEOF(obj = parser.getObj())) {
+        if (isCmd(obj)) {
+          var cmd = obj.cmd;
+          switch (cmd) {
+            case 'Tf':
+              font = handleSetFont(args[0].name);
+              break;
+            case 'TJ':
+              // Numbers in a TJ array adjust the text position; skip them.
+              var items = args[0];
+              for (var j = 0, jj = items.length; j < jj; j++) {
+                if (typeof items[j] === 'string')
+                  text += items[j];
+              }
+              break;
+            case 'Tj':
+              text += args[0];
+              break;
+          } // switch
+
+          args = [];
+        } else if (obj != null) {
+          assertWellFormed(args.length <= 33, 'Too many arguments');
+          args.push(obj);
+        }
+      }
+
+      return text;
+    },
+
     extractDataStructures: function
       partialEvaluatorExtractDataStructures(dict, baseDict,
                                             xref, properties) {
@@ -837,15 +903,18 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
 
       if (type.name === 'Type3') {
         properties.coded = true;
-        var charProcs = xref.fetchIfRef(dict.get('CharProcs'));
-        var fontResources = xref.fetchIfRef(dict.get('Resources')) || resources;
-        properties.resources = fontResources;
-        properties.charProcIRQueues = {};
-        for (var key in charProcs.map) {
-          var glyphStream = xref.fetchIfRef(charProcs.map[key]);
-          var queueObj = {};
-          properties.charProcIRQueues[key] =
-            this.getIRQueue(glyphStream, fontResources, queueObj, dependency);
+        // read char procs only if dependency is specified
+        if (dependency) {
+          var charProcs = xref.fetchIfRef(dict.get('CharProcs'));
+          var fontResources = xref.fetchIfRef(dict.get('Resources')) || resources;
+          properties.resources = fontResources;
+          properties.charProcIRQueues = {};
+          for (var key in charProcs.map) {
+            var glyphStream = xref.fetchIfRef(charProcs.map[key]);
+            var queueObj = {};
+            properties.charProcIRQueues[key] =
+              this.getIRQueue(glyphStream, fontResources, queueObj, dependency);
+          }
         }
       }
 
diff --git a/src/worker.js b/src/worker.js
index 52e631c92..c3992e54f 100644
--- a/src/worker.js
+++ b/src/worker.js
@@ -160,6 +160,30 @@ var WorkerMessageHandler = {
 
       handler.send('font_ready', [objId, obj]);
     });
+
+    // Extracts the text of every page and sends it back as an array with
+    // one string per page; pages that fail are reported as empty strings.
+    handler.on('extract_text', function wphExtractText() {
+      var numPages = pdfDoc.numPages;
+      var index = [];
+      // Started before the loop, so that the log covers all the pages.
+      var start = Date.now();
+      for (var i = 0; i < numPages; i++) {
+        var textContent = '';
+        try {
+          var page = pdfDoc.getPage(i + 1);
+          textContent = page.extractTextContent();
+        } catch (e) {
+          // Skip errored pages
+        }
+
+        index.push(textContent);
+      }
+
+      console.log('text indexing=: time=%dms', Date.now() - start);
+
+      handler.send('text_extracted', { index: index });
+    });
   }
 };
 
diff --git a/web/viewer.js b/web/viewer.js
index bdcac09d5..465df5ab5 100644
--- a/web/viewer.js
+++ b/web/viewer.js
@@ -309,6 +309,18 @@ var PDFView = {
     }
     else
       this.page = 1;
+
+    setTimeout((function loadStartTextExtraction() {
+      this.startTextExtraction(pdf);
+    }).bind(this), 500);
+  },
+
+  // Requests the text of all the pages; for now the result is only logged.
+  startTextExtraction: function pdfViewStartTextExtraction(pdf) {
+    pdf.textExtracted = function pdfTextExtracted(index) {
+      console.log(index.join());
+    };
+    pdf.extractText();
   },
 
   setHash: function pdfViewSetHash(hash) {