Text char codes extraction
This commit is contained in:
parent
853f16085f
commit
3b72c6063c
50
src/core.js
50
src/core.js
@ -200,10 +200,12 @@ var Page = (function PageClosure() {
|
||||
if (isArray(content)) {
|
||||
// fetching items
|
||||
var i, n = content.length;
|
||||
var streams = [];
|
||||
for (i = 0; i < n; ++i)
|
||||
content[i] = xref.fetchIfRef(content[i]);
|
||||
content = new StreamsSequenceStream(content);
|
||||
}
|
||||
streams.push(xref.fetchIfRef(content[i]));
|
||||
content = new StreamsSequenceStream(streams);
|
||||
} else if (isStream(content))
|
||||
content.pos = 0;
|
||||
|
||||
var pe = this.pe = new PartialEvaluator(
|
||||
xref, handler, 'p' + this.pageNumber + '_');
|
||||
@ -212,6 +214,36 @@ var Page = (function PageClosure() {
|
||||
dependency));
|
||||
},
|
||||
|
||||
extractTextContent: function pageExtractPageContent() {
|
||||
if ('textContent' in this) {
|
||||
// text content was extracted
|
||||
return this.textContent;
|
||||
}
|
||||
|
||||
var handler = {
|
||||
on: function () {},
|
||||
send: function() {}
|
||||
};
|
||||
|
||||
var xref = this.xref;
|
||||
var content = xref.fetchIfRef(this.content);
|
||||
var resources = xref.fetchIfRef(this.resources);
|
||||
if (isArray(content)) {
|
||||
// fetching items
|
||||
var i, n = content.length;
|
||||
var streams = [];
|
||||
for (i = 0; i < n; ++i)
|
||||
streams.push(xref.fetchIfRef(content[i]));
|
||||
content = new StreamsSequenceStream(streams);
|
||||
} else if (isStream(content))
|
||||
content.pos = 0;
|
||||
|
||||
var pe = new PartialEvaluator(
|
||||
xref, handler, 'p' + this.pageNumber + '_');
|
||||
var text = pe.getTextContent(content, resources);
|
||||
return (this.textContent = text);
|
||||
},
|
||||
|
||||
ensureFonts: function pageEnsureFonts(fonts, callback) {
|
||||
// Convert the font names to the corresponding font obj.
|
||||
for (var i = 0, ii = fonts.length; i < ii; i++) {
|
||||
@ -614,6 +646,12 @@ var PDFDoc = (function PDFDocClosure() {
|
||||
throw data.error;
|
||||
}, this);
|
||||
|
||||
// Handles the 'text_extracted' message: passes data.index to the optional
// textExtracted callback of this document, when one has been set.
// NOTE(review): the trailing `this` is presumably the scope the callback is
// invoked with -- confirm against the message handler implementation.
messageHandler.on('text_extracted', function pdfDocTextExtracted(data) {
  var index = data.index;
  if (this.textExtracted)
    this.textExtracted(index);
}, this);
|
||||
|
||||
setTimeout(function pdfDocFontReadySetTimeout() {
|
||||
messageHandler.send('doc', this.data);
|
||||
this.workerReadyPromise.resolve(true);
|
||||
@ -643,6 +681,12 @@ var PDFDoc = (function PDFDocClosure() {
|
||||
return (this.pageCache[n] = page);
|
||||
},
|
||||
|
||||
extractText: function pdfDocExtractExtractText() {
|
||||
this.workerReadyPromise.then(function pdfDocStartRenderingThen() {
|
||||
this.messageHandler.send('extract_text');
|
||||
}.bind(this));
|
||||
},
|
||||
|
||||
destroy: function pdfDocDestroy() {
|
||||
if (this.worker)
|
||||
this.worker.terminate();
|
||||
|
@ -144,7 +144,7 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
|
||||
fontRef = fontRef || fontRes.get(fontName);
|
||||
var font = xref.fetchIfRef(fontRef);
|
||||
assertWellFormed(isDict(font));
|
||||
if (!font.translated) {
|
||||
if (!font.loadedName) {
|
||||
font.translated = self.translateFont(font, xref, resources,
|
||||
dependency);
|
||||
if (font.translated) {
|
||||
@ -464,6 +464,65 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
|
||||
};
|
||||
},
|
||||
|
||||
getTextContent: function partialEvaluatorGetIRQueue(stream, resources) {
|
||||
|
||||
var self = this;
|
||||
var xref = this.xref;
|
||||
|
||||
function handleSetFont(fontName, fontRef) {
|
||||
var fontRes = resources.get('Font');
|
||||
|
||||
// TODO: TOASK: Is it possible to get here? If so, what does
|
||||
// args[0].name should be like???
|
||||
assert(fontRes, 'fontRes not available');
|
||||
|
||||
fontRes = xref.fetchIfRef(fontRes);
|
||||
fontRef = fontRef || fontRes.get(fontName);
|
||||
var font = xref.fetchIfRef(fontRef), tra;
|
||||
assertWellFormed(isDict(font));
|
||||
if (!font.translated) {
|
||||
font.translated = self.translateFont(font, xref, resources);
|
||||
}
|
||||
return font;
|
||||
}
|
||||
|
||||
resources = xref.fetchIfRef(resources) || new Dict();
|
||||
|
||||
var parser = new Parser(new Lexer(stream), false);
|
||||
var res = resources;
|
||||
var args = [], obj;
|
||||
|
||||
var text = '';
|
||||
var font = null;
|
||||
while (!isEOF(obj = parser.getObj())) {
|
||||
if (isCmd(obj)) {
|
||||
var cmd = obj.cmd;
|
||||
switch (cmd) {
|
||||
case 'Tf':
|
||||
font = handleSetFont(args[0].name);
|
||||
break;
|
||||
case 'TJ':
|
||||
var items = args[0];
|
||||
for (var j = 0, jj = items.length; j < jj; j++) {
|
||||
if (typeof items[j] === 'string')
|
||||
text += items[j];
|
||||
}
|
||||
break;
|
||||
case 'Tj':
|
||||
text += args[0];
|
||||
break;
|
||||
} // switch
|
||||
|
||||
args = [];
|
||||
} else if (obj != null) {
|
||||
assertWellFormed(args.length <= 33, 'Too many arguments');
|
||||
args.push(obj);
|
||||
}
|
||||
}
|
||||
|
||||
return text;
|
||||
},
|
||||
|
||||
extractDataStructures: function
|
||||
partialEvaluatorExtractDataStructures(dict, baseDict,
|
||||
xref, properties) {
|
||||
@ -837,15 +896,18 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
|
||||
|
||||
if (type.name === 'Type3') {
|
||||
properties.coded = true;
|
||||
var charProcs = xref.fetchIfRef(dict.get('CharProcs'));
|
||||
var fontResources = xref.fetchIfRef(dict.get('Resources')) || resources;
|
||||
properties.resources = fontResources;
|
||||
properties.charProcIRQueues = {};
|
||||
for (var key in charProcs.map) {
|
||||
var glyphStream = xref.fetchIfRef(charProcs.map[key]);
|
||||
var queueObj = {};
|
||||
properties.charProcIRQueues[key] =
|
||||
this.getIRQueue(glyphStream, fontResources, queueObj, dependency);
|
||||
// read char procs only if dependency is specified
|
||||
if (dependency) {
|
||||
var charProcs = xref.fetchIfRef(dict.get('CharProcs'));
|
||||
var fontResources = xref.fetchIfRef(dict.get('Resources')) || resources;
|
||||
properties.resources = fontResources;
|
||||
properties.charProcIRQueues = {};
|
||||
for (var key in charProcs.map) {
|
||||
var glyphStream = xref.fetchIfRef(charProcs.map[key]);
|
||||
var queueObj = {};
|
||||
properties.charProcIRQueues[key] =
|
||||
this.getIRQueue(glyphStream, fontResources, queueObj, dependency);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -160,6 +160,28 @@ var WorkerMessageHandler = {
|
||||
|
||||
handler.send('font_ready', [objId, obj]);
|
||||
});
|
||||
|
||||
// Handles the 'extract_text' message: extracts the text of every page and
// replies with a 'text_extracted' message whose index array holds one entry
// per page, in page order.
handler.on('extract_text', function wphExtractText() {
  var numPages = pdfDoc.numPages;
  var index = [];
  // Taken once, before the loop, so that the logged time covers all the
  // pages (and is defined even when there are no pages).
  var start = Date.now();

  for (var i = 0; i < numPages; i++) {
    var textContent = '';
    try {
      // Pages are requested with 1-based numbers.
      var page = pdfDoc.getPage(i + 1);
      textContent = page.extractTextContent();
    } catch (e) {
      // Skip errored pages: they keep an empty entry, so positions in index
      // still match the page order. The failure is reported, not hidden.
      console.log('text extraction failed for page %d: %s', i + 1, e);
    }

    index.push(textContent);
  }

  console.log('text indexing=: time=%dms', Date.now() - start);

  handler.send('text_extracted', { index: index });
});
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -309,6 +309,17 @@ var PDFView = {
|
||||
}
|
||||
else
|
||||
this.page = 1;
|
||||
|
||||
setTimeout((function loadStartTextExtraction() {
|
||||
this.startTextExtraction(pdf);
|
||||
}).bind(this), 500);
|
||||
},
|
||||
|
||||
startTextExtraction: function(pdf) {
|
||||
pdf.textExtracted = function pdfTextExtracted(index) {
|
||||
console.log(index.join());
|
||||
};
|
||||
pdf.extractText();
|
||||
},
|
||||
|
||||
setHash: function pdfViewSetHash(hash) {
|
||||
|
Loading…
x
Reference in New Issue
Block a user