Extract one page after the other and not all pages at once
This commit is contained in:
parent
3c77291013
commit
c9fb5637c3
29
src/core.js
29
src/core.js
@ -698,6 +698,9 @@ var PDFDoc = (function PDFDocClosure() {
|
|||||||
this.fontsLoading = {};
|
this.fontsLoading = {};
|
||||||
this.workerReadyPromise = new Promise('workerReady');
|
this.workerReadyPromise = new Promise('workerReady');
|
||||||
|
|
||||||
|
this.pageText = [];
|
||||||
|
this.startedTextExtraction = false;
|
||||||
|
|
||||||
// If worker support isn't explicitly disabled and the browser has worker
|
// If worker support isn't explicitly disabled and the browser has worker
|
||||||
// support, create a new web worker and test if it/the browser fulfills
|
// support, create a new web worker and test if it/the browser fulfills
|
||||||
// all requirements to run parts of pdf.js in a web worker.
|
// all requirements to run parts of pdf.js in a web worker.
|
||||||
@ -769,7 +772,6 @@ var PDFDoc = (function PDFDocClosure() {
|
|||||||
WorkerMessageHandler.setup(messageHandler);
|
WorkerMessageHandler.setup(messageHandler);
|
||||||
},
|
},
|
||||||
|
|
||||||
|
|
||||||
setupMessageHandler: function PDFDoc_setupMessageHandler(messageHandler) {
|
setupMessageHandler: function PDFDoc_setupMessageHandler(messageHandler) {
|
||||||
this.messageHandler = messageHandler;
|
this.messageHandler = messageHandler;
|
||||||
|
|
||||||
@ -825,9 +827,18 @@ var PDFDoc = (function PDFDocClosure() {
|
|||||||
}, this);
|
}, this);
|
||||||
|
|
||||||
messageHandler.on('text_extracted', function pdfTextExtracted(data) {
|
messageHandler.on('text_extracted', function pdfTextExtracted(data) {
|
||||||
var index = data[0];
|
var pageNum = data[0];
|
||||||
|
var content = data[1];
|
||||||
|
if (pageNum !== this.pageText.length + 1)
|
||||||
|
error('pdfTextExtracted: pageIdx and pageText length got to fit');
|
||||||
|
|
||||||
|
this.pageText.push(content);
|
||||||
|
|
||||||
if (this.textExtracted)
|
if (this.textExtracted)
|
||||||
this.textExtracted(index);
|
this.textExtracted(pageNum, content);
|
||||||
|
|
||||||
|
if (pageNum < this.numPages)
|
||||||
|
this.extractTextPage(pageNum + 1);
|
||||||
}, this);
|
}, this);
|
||||||
|
|
||||||
messageHandler.on('jpeg_decode', function(data, promise) {
|
messageHandler.on('jpeg_decode', function(data, promise) {
|
||||||
@ -895,9 +906,19 @@ var PDFDoc = (function PDFDocClosure() {
|
|||||||
return (this.pageCache[n] = page);
|
return (this.pageCache[n] = page);
|
||||||
},
|
},
|
||||||
|
|
||||||
|
extractTextPage: function PDFDoc_extractTextPage(pageNum) {
|
||||||
|
this.messageHandler.send('extract_text', pageNum);
|
||||||
|
},
|
||||||
|
|
||||||
extractText: function PDFDoc_extractText() {
|
extractText: function PDFDoc_extractText() {
|
||||||
|
if (this.startedTextExtraction)
|
||||||
|
return;
|
||||||
|
|
||||||
|
this.startedTextExtraction = true;
|
||||||
|
|
||||||
this.workerReadyPromise.then(function pdfDocStartRenderingThen() {
|
this.workerReadyPromise.then(function pdfDocStartRenderingThen() {
|
||||||
this.messageHandler.send('extract_text');
|
// Start the text extraction process.
|
||||||
|
this.extractTextPage(1);
|
||||||
}.bind(this));
|
}.bind(this));
|
||||||
},
|
},
|
||||||
|
|
||||||
|
@ -94,7 +94,6 @@ var WorkerMessageHandler = {
|
|||||||
handler.on('page_request', function wphSetupPageRequest(pageNum) {
|
handler.on('page_request', function wphSetupPageRequest(pageNum) {
|
||||||
pageNum = parseInt(pageNum);
|
pageNum = parseInt(pageNum);
|
||||||
|
|
||||||
|
|
||||||
// The following code does much the same as
|
// The following code does much the same as
|
||||||
// Page.prototype.startRendering, but stops at one point and sends the
|
// Page.prototype.startRendering, but stops at one point and sends the
|
||||||
// result back to the main thread.
|
// result back to the main thread.
|
||||||
@ -156,37 +155,20 @@ var WorkerMessageHandler = {
|
|||||||
});
|
});
|
||||||
}, this);
|
}, this);
|
||||||
|
|
||||||
handler.on('extract_text', function wphExtractText() {
|
handler.on('extract_text', function wphExtractText(pageNum) {
|
||||||
var numPages = pdfModel.numPages;
|
|
||||||
var index = [];
|
|
||||||
var start = Date.now();
|
var start = Date.now();
|
||||||
|
|
||||||
function indexPage(pageNum) {
|
var textContent = '';
|
||||||
if (pageNum > numPages) {
|
try {
|
||||||
console.log('text indexing: time=%dms', Date.now() - start);
|
var page = pdfModel.getPage(pageNum);
|
||||||
|
textContent = page.extractTextContent();
|
||||||
handler.send('text_extracted', [index]);
|
} catch (e) {
|
||||||
return;
|
// Skip errored pages
|
||||||
}
|
|
||||||
|
|
||||||
var textContent = '';
|
|
||||||
// try {
|
|
||||||
var page = pdfModel.getPage(pageNum);
|
|
||||||
textContent = page.extractTextContent();
|
|
||||||
// } catch (e) {
|
|
||||||
// // Skip errored pages
|
|
||||||
// }
|
|
||||||
|
|
||||||
index.push(textContent);
|
|
||||||
|
|
||||||
// processing one page, interrupting thread to process
|
|
||||||
// other requests
|
|
||||||
setTimeout(function extractTextNextPage() {
|
|
||||||
indexPage(pageNum + 1);
|
|
||||||
}, 0);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
indexPage(1);
|
console.log('text indexing: page=%d - time=%dms',
|
||||||
|
pageNum, Date.now() - start);
|
||||||
|
handler.send('text_extracted', [pageNum, textContent]);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
@ -491,7 +491,7 @@ var PDFView = {
|
|||||||
|
|
||||||
var pdf;
|
var pdf;
|
||||||
try {
|
try {
|
||||||
pdf = new PDFJS.PDFDoc(data);
|
this.pdfDoc = pdf = new PDFJS.PDFDoc(data);
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
this.error('An error occurred while reading the PDF.', e);
|
this.error('An error occurred while reading the PDF.', e);
|
||||||
}
|
}
|
||||||
@ -576,22 +576,18 @@ var PDFView = {
|
|||||||
|
|
||||||
if (pdfTitle)
|
if (pdfTitle)
|
||||||
document.title = pdfTitle + ' - ' + document.title;
|
document.title = pdfTitle + ' - ' + document.title;
|
||||||
|
|
||||||
// losing pdf reference here, starting text indexing in 500ms
|
|
||||||
setTimeout((function loadStartTextExtraction() {
|
|
||||||
this.startTextExtraction(pdf);
|
|
||||||
}).bind(this), 500);
|
|
||||||
delete PDFView.extractedText;
|
|
||||||
},
|
},
|
||||||
|
|
||||||
startTextExtraction: function pdfViewStartTextExtraction(pdf) {
|
startTextExtraction: function pdfViewStartTextExtraction(pdf) {
|
||||||
var searchResults = document.getElementById('searchResults');
|
var searchResults = document.getElementById('searchResults');
|
||||||
searchResults.textContent = '';
|
searchResults.textContent = '';
|
||||||
|
|
||||||
pdf.textExtracted = function pdfTextExtracted(index) {
|
pdf.textExtracted = (function pdfTextExtracted(pageIdx, content) {
|
||||||
PDFView.extractedText = index;
|
this.search();
|
||||||
};
|
}).bind(this);
|
||||||
pdf.extractText();
|
pdf.extractText();
|
||||||
|
|
||||||
|
this.pdfDoc = pdf;
|
||||||
},
|
},
|
||||||
|
|
||||||
search: function pdfViewStartSearch() {
|
search: function pdfViewStartSearch() {
|
||||||
@ -604,21 +600,19 @@ var PDFView = {
|
|||||||
}
|
}
|
||||||
|
|
||||||
var searchResults = document.getElementById('searchResults');
|
var searchResults = document.getElementById('searchResults');
|
||||||
if (!('extractedText' in PDFView)) {
|
|
||||||
// not indexed yet, repeat in 1 second
|
|
||||||
searchResults.textContent = 'Searching...';
|
|
||||||
setTimeout(this.search.bind(this), 1000);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
var searchTermsInput = document.getElementById('searchTermsInput');
|
var searchTermsInput = document.getElementById('searchTermsInput');
|
||||||
searchResults.removeAttribute('hidden');
|
searchResults.removeAttribute('hidden');
|
||||||
searchResults.textContent = '';
|
searchResults.textContent = '';
|
||||||
|
|
||||||
var terms = searchTermsInput.value;
|
var terms = searchTermsInput.value;
|
||||||
|
|
||||||
|
if (!terms)
|
||||||
|
return;
|
||||||
|
|
||||||
// simple search: removing spaces and hyphens, then scanning every
|
// simple search: removing spaces and hyphens, then scanning every
|
||||||
terms = terms.replace(/\s-/g, '').toLowerCase();
|
terms = terms.replace(/\s-/g, '').toLowerCase();
|
||||||
var index = PDFView.extractedText;
|
var index = PDFView.pdfDoc.pageText;
|
||||||
var pageFound = false;
|
var pageFound = false;
|
||||||
for (var i = 0, ii = index.length; i < ii; i++) {
|
for (var i = 0, ii = index.length; i < ii; i++) {
|
||||||
var pageText = index[i].replace(/\s-/g, '').toLowerCase();
|
var pageText = index[i].replace(/\s-/g, '').toLowerCase();
|
||||||
@ -708,6 +702,9 @@ var PDFView = {
|
|||||||
|
|
||||||
var searchTermsInput = document.getElementById('searchTermsInput');
|
var searchTermsInput = document.getElementById('searchTermsInput');
|
||||||
searchTermsInput.focus();
|
searchTermsInput.focus();
|
||||||
|
|
||||||
|
// Start text extraction as soon as the search gets displayed.
|
||||||
|
this.pdfDoc.extractText();
|
||||||
} else {
|
} else {
|
||||||
searchScrollView.setAttribute('hidden', 'true');
|
searchScrollView.setAttribute('hidden', 'true');
|
||||||
searchSwitchButton.removeAttribute('data-selected');
|
searchSwitchButton.removeAttribute('data-selected');
|
||||||
|
Loading…
x
Reference in New Issue
Block a user